def process_user(self, job, js, headers, raw_response):
    """
    Process user detail data
    :param job:
    :param js:
    :param headers:
    :param raw_response:
    :return:
    """
    if 'id' not in js:
        logger.error('Field ID not found in user')
        return

    s = self.session()
    try:
        user_id = int(js['id'])
        dbu = s.query(GitHubUserDetails).filter(
            GitHubUserDetails.id == user_id).one_or_none()

        is_new = False
        if dbu is None:
            is_new = True
            dbu = GitHubUserDetails()
            dbu.id = user_id

        dbu.date_last_check = salch.func.now()
        dbu.username = js['login']
        dbu.name = utils.utf8ize(utils.defvalkey(js, 'name'))
        dbu.company = utils.utf8ize(utils.defvalkey(js, 'company'))
        dbu.blog = utils.defvalkey(js, 'blog')
        dbu.email = utils.defvalkey(js, 'email')
        dbu.bio = utils.utf8ize(utils.defvalkey(js, 'bio'))
        dbu.usr_type = utils.defvalkey(js, 'type')

        dbu.public_repos = js['public_repos']
        dbu.public_gists = js['public_gists']
        dbu.followers = js['followers']
        dbu.following = js['following']

        dbu.created_at = utils.dt_norm(
            utils.try_parse_timestamp(utils.defvalkey(js, 'created_at')))
        dbu.updated_at = utils.dt_norm(
            utils.try_parse_timestamp(utils.defvalkey(js, 'updated_at')))

        if is_new:
            s.add(dbu)
        else:
            s.merge(dbu)

        s.commit()
        s.flush()
        s.expunge_all()

    except Exception as e:
        logger.error('Exception storing user details: %s: %s' % (js['id'], e))
        logger.debug(traceback.format_exc())

    finally:
        utils.silent_close(s)
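# Note: the helpers utils.defvalkey, utils.utf8ize and utils.silent_close are used
# throughout these methods but are not defined here. The sketch below shows the
# behaviour the calling code above assumes; it is illustrative only and may differ
# from the project's actual implementations.
def defvalkey(js, key, default=None):
    """Return js[key] when the dict exists and contains the key, otherwise the default."""
    if js is None or key not in js:
        return default
    return js[key]


def utf8ize(value):
    """Coerce a value to UTF-8 safe text, passing None through unchanged."""
    if value is None:
        return None
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='replace')
    return str(value)


def silent_close(session):
    """Close a SQLAlchemy session, swallowing any exception raised by close()."""
    try:
        if session is not None:
            session.close()
    except Exception:
        pass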
def process_item(self, item, spider):
    """
    Process item for persisting
    :param item:
    :param spider:
    :return:
    """
    s = None
    try:
        s = self.session()
        if isinstance(item, (PomItem, type(PomItem()), type(PomItem))):
            self.store_pom(item, s)
        elif isinstance(item, (AscItem, type(AscItem()), type(AscItem))):
            self.store_asc(item, s)
        elif isinstance(item, (ArtifactItem, type(ArtifactItem()), type(ArtifactItem))):
            self.store_index(item, s)
        elif isinstance(item, LinkItem):
            pass
        else:
            logger.warning('Unknown item: %s type %s' % (item, type(item)))
            return

        s.commit()
        s.flush()  # writes changes to DB
        s.expunge_all()  # removes objects from session

    except Exception as e:
        logger.warning('Exception in storing item %s' % e)

    finally:
        utils.silent_close(s)
        s = None

    return item
def work(self):
    """
    Entry point after argument processing.
    :return:
    """
    self.config_file = self.args.config
    self.init_config()
    self.init_db()

    s = self.session()
    js_res = []
    try:
        res = s.query(MavenSignature.sig_key_id).group_by(
            MavenSignature.sig_key_id).all()
        for keyrec in res:
            js_res.append(keyrec.sig_key_id)

        js_res.sort()
        print(json.dumps(js_res))

    except Exception as e:
        logger.error('Exception in dump: %s' % e)
        logger.debug(traceback.format_exc())

    finally:
        utils.silent_close(s)
def process_colab(self, job, js, headers, raw_response):
    """
    Process collaborators for org-owned repos
    :param job:
    :param js:
    :param headers:
    :param raw_response:
    :return:
    """
    for colab in js:
        if 'id' not in colab:
            logger.error('Field ID not found in colab')
            continue

        s = self.session()
        try:
            # delete first - avoid excs
            s.query(GitHubRepoColab)\
                .filter(GitHubRepoColab.user_name == colab['login'])\
                .filter(GitHubRepoColab.repo_name == job.meta['repo'])\
                .delete()

            dbu = GitHubRepoColab()
            dbu.repo_name = job.meta['repo']
            dbu.user_name = colab['login']
            dbu.can_pull = colab['permissions']['pull']
            dbu.can_push = colab['permissions']['push']
            dbu.can_admin = colab['permissions']['admin']

            s.add(dbu)
            s.commit()
            s.flush()
            s.expunge_all()

        except Exception as e:
            logger.error('Exception storing colab details: %s:%s: %s'
                         % (colab['login'], job.meta['repo'], e))
            logger.debug(traceback.format_exc())

        finally:
            utils.silent_close(s)

    if len(js) == 0:
        return

    # Load next page
    cur_page = utils.defvalkey(job.meta, 'page', 1)
    new_url = (self.ORG_REPO_COLAB_URL % (job.meta['repo'])) + ('?page=%s' % (cur_page + 1))

    new_meta = dict(job.meta)
    new_meta['page'] = cur_page + 1

    job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPO_COLAB, meta=new_meta)
    self.link_queue.put(job)
def store_users_list(self, users):
    """
    Stores all users in the list
    :param users:
    :return:
    """
    # Handle gaps in the user ID space. With the user-only optimization the load
    # windows can overlap, so already-processed IDs are skipped here.
    reduced_by = 0
    with self.processed_user_set_lock:
        ids = [user.user_id for user in users]
        ids_ok = []
        for user_id in ids:
            if user_id in self.processed_user_set:
                reduced_by += 1
                continue
            self.processed_user_set.add(user_id)
            ids_ok.append(user_id)
        users = [user for user in users if user.user_id in ids_ok]

    # Bulk user load
    s = self.session()
    id_list = sorted([user.user_id for user in users])
    db_users = s.query(GitHubUserDb).filter(
        GitHubUserDb.id.in_(id_list)).all()
    db_user_map = {user.id: user for user in db_users}

    for user in users:
        self.new_users_events.insert()

        # Store user to the DB
        try:
            db_user = utils.defvalkey(db_user_map, key=user.user_id)
            self.store_user(user, s, db_user=db_user, db_user_loaded=True)

        except Exception as e:
            logger.warning('[%02d] Exception in storing user %s'
                           % (self.local_data.idx, e))
            self.trace_logger.log(e)
            logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
            self.trigger_quit()
            break

    try:
        s.commit()
        # logger.info('[%02d] Commited, reduced by: %s' % (self.local_data.idx, reduced_by))

    except Exception as e:
        logger.warning('[%02d] Exception in storing bulk users' % self.local_data.idx)
        logger.warning(traceback.format_exc())
        logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
        self.trigger_quit()

    finally:
        utils.silent_close(s)
def process_assignee(self, job, js, headers, raw_response):
    """
    Process assignees for org-owned repos
    :param job:
    :param js:
    :param headers:
    :param raw_response:
    :return:
    """
    for assignee in js:
        if 'id' not in assignee:
            logger.error('Field ID not found in assignees')
            continue

        s = self.session()
        try:
            # delete first - avoid excs
            s.query(GitHubRepoAssignee)\
                .filter(GitHubRepoAssignee.user_name == assignee['login'])\
                .filter(GitHubRepoAssignee.repo_name == job.meta['repo'])\
                .delete()

            dbu = GitHubRepoAssignee()
            dbu.repo_name = job.meta['repo']
            dbu.user_name = assignee['login']

            s.add(dbu)
            s.commit()
            s.flush()
            s.expunge_all()

        except Exception as e:
            logger.error('Exception storing assignee details: %s:%s: %s'
                         % (assignee['login'], job.meta['repo'], e))
            logger.debug(traceback.format_exc())

        finally:
            utils.silent_close(s)

    if len(js) == 0:
        return

    # Load next page
    cur_page = utils.defvalkey(job.meta, 'page', 1)
    new_url = (self.ORG_REPO_ASSIGNEES_URL % (job.meta['repo'])) + ('?page=%s' % (cur_page + 1))

    new_meta = dict(job.meta)
    new_meta['page'] = cur_page + 1

    job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPO_ASSIGNEE, meta=new_meta)
    self.link_queue.put(job)
def flush_sqlite(self, buff):
    if len(buff) == 0:
        return
    if self.sqlite_file is None:
        return
    if not self.sqlite_data:
        return

    s = self.sqlite_session()
    for elem in buff:
        s.merge(elem)

    logger.debug('Committing %d elems %s' % (len(buff), s))
    s.flush()
    s.commit()
    utils.silent_close(s)
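# Usage sketch for flush_sqlite(): callers are expected to batch ORM objects and
# flush them periodically. The helper below and its threshold are hypothetical,
# not part of the original pipeline; only the merge-then-commit behaviour of
# flush_sqlite() is taken from the code above.
def flush_in_batches(pipeline, records, threshold=100):
    """Feed ORM objects to pipeline.flush_sqlite() in fixed-size batches."""
    buff = []
    for record in records:
        buff.append(record)
        if len(buff) >= threshold:   # flush once the batch is full
            pipeline.flush_sqlite(buff)
            buff = []
    if buff:                         # flush the remainder
        pipeline.flush_sqlite(buff)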
def _init_queue(self):
    """
    Initializes link queue
    :return:
    """
    sess = self.session()
    logger.debug('Loading users...')

    users_cnt = 0
    all_keys = sess.query(GitHubKey).filter(
        GitHubKey.is_interesting == 1).all()
    logger.debug('All users loaded')

    for rec in all_keys:
        users_cnt += 1
        job = DownloadJob(url=self.USER_DETAIL_URL % rec.key_user_found,
                          jtype=DownloadJob.TYPE_USER,
                          meta={
                              'user': rec.key_user_found,
                              'user_id': rec.key_user_id_found
                          })
        self.link_queue.put(job)

        job = DownloadJob(url=self.USER_ORGS_URL % rec.key_user_found,
                          jtype=DownloadJob.TYPE_ORG,
                          meta={
                              'user': rec.key_user_found,
                              'user_id': rec.key_user_id_found
                          })
        self.link_queue.put(job)

        job = DownloadJob(url=self.USER_REPOS_URL % rec.key_user_found,
                          jtype=DownloadJob.TYPE_REPOS_USER,
                          meta={
                              'user': rec.key_user_found,
                              'user_id': rec.key_user_id_found
                          })
        self.link_queue.put(job)

    logger.info('Queue initialized, users cnt: %s' % users_cnt)
    utils.silent_close(sess)
def fill_user_key_links(self):
    """
    Loads next X users from the database, advances since_id
    :return:
    """
    # self.since_id
    s = self.session()
    try:
        db_users = s.query(GitHubUserDb)\
            .filter(GitHubUserDb.id > self.since_id)\
            .order_by(GitHubUserDb.id)\
            .limit(self.user_load_bulk)\
            .all()

        for user in db_users:
            key_url = self.KEYS_ID_URL % user.id
            github_user = GitHubUser(user_id=user.id,
                                     user_name=user.username,
                                     user_type=user.usr_type,
                                     user_url=self.USER_URL % user.username)

            new_job = DownloadJob(url=key_url,
                                  jtype=DownloadJob.TYPE_KEYS,
                                  user=github_user,
                                  priority=random.randint(0, 1000),
                                  time_added=time.time())
            self.link_queue.put(new_job)

            if user.id > self.since_id:
                self.since_id = user.id

    except Exception as e:
        logger.warning('Exception in loading users: %s' % e)
        utils.silent_rollback(s)

    finally:
        utils.silent_close(s)
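# fill_user_key_links() is a keyset-pagination ("seek") scan: filter id > since_id,
# order by id, take a bounded batch, then advance since_id to the largest id seen.
# The standalone sketch below mirrors that windowing logic with plain lists; the
# names and data are illustrative, and in the real code the database does the
# filtering, ordering and limiting.
def next_batch(rows, since_id, bulk_size):
    """Return the next window of rows with id > since_id and the advanced since_id."""
    window = sorted((r for r in rows if r['id'] > since_id),
                    key=lambda r: r['id'])[:bulk_size]
    new_since_id = window[-1]['id'] if window else since_id
    return window, new_since_id

# Example over an id space with gaps ({1, 2, 5, 9}):
#   next_batch(rows, since_id=0, bulk_size=2) -> ids 1, 2 and since_id 2
#   next_batch(rows, since_id=2, bulk_size=2) -> ids 5, 9 and since_id 9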
def process_keys_data(self, job, js, headers, raw_response):
    """
    Processing key loaded data
    :param job:
    :param js:
    :param headers:
    :param raw_response:
    :return:
    """
    js_empty = js is None or len(js) == 0

    # Expect failures, commit everything before
    if self.merge and not js_empty:
        try:
            s = self.session()
            s.commit()
        except Exception as e:
            logger.warning('Could not pre-commit: %s' % e)

    # Store each key.
    key_ids = []
    for key in js:
        s = None
        self.new_keys_events.insert()
        try:
            s = self.session()
            self.store_key(job.user, key, s)
            key_ids.append(int(key['id']))
            s.commit()

            self.assoc_key(job.user.user_id, key['id'], s)
            s.commit()

            s.flush()  # writes changes to DB
            s.expunge_all()  # removes objects from session

        except Exception as e:
            logger.warning('Exception in storing key %s' % e)
            self.trace_logger.log(e)

        finally:
            utils.silent_close(s)
            s = None

    # Deassoc lost keys
    try:
        s = self.session()
        self.deassoc_lost_keys(job.user.user_id, key_ids, s)
        s.commit()

    except Exception as e:
        logger.warning('Exception in deassoc for users %s : %s' % (job.user.user_id, e))
        self.trace_logger.log(e)

    finally:
        utils.silent_close(s)
        s = None

    self.on_keys_processed()
def parse_page(self, response):
    """
    General page parser
    :param response:
    :return:
    """
    links_visit = set()
    links = set()
    for link in LxmlLinkExtractor(allow=(), deny=()).extract_links(response):
        # Add all links except the up link.
        if link.text != '../':
            links.add(link.url)

    # Links extracted from the current page.
    # Extract links only if landed in the artifact directory.
    is_artifact = False
    art_conf = 0
    if len(links) < 100:
        art_conf += 3

    versions = []
    misc_files = []
    for link in links:
        if link.endswith('/maven-metadata.xml'):
            is_artifact = True

        last_segment = link
        last_slash = link[-1] == '/'
        if last_slash:
            last_segment = link[0:-1]
        last_segment = last_segment.rsplit('/', 1)[1]

        if self.is_version_folder(last_segment):
            art_conf += 1
            versions.append({'v': last_segment,
                             'l': self.remove_prefix(link, response.url)})
        elif link != response.url:
            misc_files.append(self.remove_prefix(link, response.url))

    # TODO: if non-standard format, download also maven-metadata.xml

    # Store only artifact-related URLs
    if is_artifact or art_conf > 5:
        logger.info('New artifact(%s), confidence(%s): %s'
                    % (is_artifact, art_conf, response.url))

        item = ArtifactItem()
        item['url'] = response.url
        item['versions'] = versions
        item['misc_files'] = misc_files
        item['artifact_detected'] = is_artifact
        item['confidence'] = art_conf
        yield item

        # Generate request for the newest version
        if is_artifact and len(versions) > 0:
            cur_sess = None
            try:
                cur_sess = self.session()
                burl = utils.strip_leading_slash(response.url)
                grp_id, art_id = get_maven_id_from_url(burl)

                for cur_version in pick_versions([x['v'] for x in versions]):
                    if self.pom_exists(grp_id, art_id, cur_version, cur_sess):
                        continue

                    logger.info('Enqueueing artifact %s %s %s' % (grp_id, art_id, cur_version))
                    meta = {'burl': burl, 'artifact_id': art_id,
                            'group_id': grp_id, 'max_version': cur_version}

                    art_url = '%s/%s' % (burl, cur_version)
                    art_base_name = '%s-%s' % (art_id, cur_version)
                    pom_link = '%s/%s.pom' % (art_url, art_base_name)

                    yield Request(pom_link, callback=self.parse_pom, meta=dict(meta))

            except Exception as e:
                logger.debug('Exception in POM exist check: %s, self: %s, sess: %s'
                             % (e, self, self.session))
                logger.debug(traceback.format_exc())

            utils.silent_close(cur_sess)

    # Case: maven-metadata is present, but there are also other directories here -> crawl them.
    # Otherwise do not follow any more links from this page.
    base_url = response.url
    if base_url[-1] != '/':
        base_url += '/'
    links = [base_url + x for x in misc_files if x.endswith('/')]

    # Links post processing
    for link in links:
        if not self.should_follow_link(link, response):
            continue

        links_visit.add(link)

    logger.debug('Extracted %s links from %s' % (len(links_visit), response.url))
    for link in list(links_visit):
        yield Request(link, callback=self.parse_page)
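# is_version_folder() and pick_versions() are project helpers that are not shown
# here. The confidence heuristic in parse_page() only assumes that the former
# recognizes Maven-style version directory names; the regex below is a plausible
# sketch of that check, not the project's actual rule.
import re

_VERSION_RE = re.compile(r'^\d+(\.\d+)*([.-][0-9A-Za-z]+)*$')


def is_version_folder(last_segment):
    """True for directory names that look like Maven versions, e.g. '1.2.3' or '2.0-SNAPSHOT'."""
    return bool(_VERSION_RE.match(last_segment))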
def process_repo(self, job, js, headers, raw_response, from_user):
    """
    Process repo list page
    :param job:
    :param js:
    :param headers:
    :param raw_response:
    :param from_user:
    :return:
    """
    for repo in js:
        if 'id' not in repo:
            logger.error('Field ID not found in repos')
            continue

        s = self.session()
        try:
            repo_id = int(repo['id'])
            dbe = s.query(GitHubRepo).filter(
                GitHubRepo.id == repo_id).one_or_none()

            dbu = GitHubRepo()
            dbu.id = repo_id
            dbu.user_repo = from_user

            if from_user:
                dbu.username = job.meta['user']
            else:
                dbu.org_name = job.meta['org']

            if 'owner' in repo:
                dbu.owner_id = repo['owner']['id']
                dbu.owner_login = repo['owner']['login']

            dbu.repo_name = repo['full_name']
            dbu.repo_stars = repo['stargazers_count']
            dbu.repo_forks = repo['forks']
            dbu.repo_watchers = repo['watchers']
            dbu.repo_is_fork = repo['fork']
            dbu.repo_size = repo['size']
            dbu.repo_homepage = utils.defvalkey(repo, 'homepage')
            dbu.repo_language = utils.defvalkey(repo, 'language')
            dbu.created_at = utils.dt_norm(
                utils.try_parse_timestamp(
                    utils.defvalkey(repo, 'created_at')))
            dbu.updated_at = utils.dt_norm(
                utils.try_parse_timestamp(
                    utils.defvalkey(repo, 'updated_at')))
            dbu.pushed_at = utils.dt_norm(
                utils.try_parse_timestamp(
                    utils.defvalkey(repo, 'pushed_at')))
            dbu.repo_description = utils.utf8ize(repo['description'])
            dbu.repo_stargazers_url = repo['stargazers_url']
            dbu.repo_forks_url = repo['forks_url']

            if not from_user and repo['stargazers_count'] > 100:
                new_meta = dict(job.meta)
                new_meta['page'] = 1
                new_meta['repo'] = repo['full_name']
                new_meta['owner'] = repo['owner']['login']

                # Colab fetch - skip, no auth (job is built but intentionally not enqueued)
                colab_job = DownloadJob(url=self.ORG_REPO_COLAB_URL % (repo['full_name']),
                                        jtype=DownloadJob.TYPE_REPO_COLAB,
                                        meta=new_meta)

                # Assignee fetch
                assignee_job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL % (repo['full_name']),
                                           jtype=DownloadJob.TYPE_REPO_ASSIGNEE,
                                           meta=dict(new_meta))
                self.link_queue.put(assignee_job)

            # DB save
            if dbe is None:
                s.add(dbu)
            else:
                if dbe.username != dbu.username:
                    logger.warning('Username does not match for %s %s %s'
                                   % (repo_id, dbe.username, dbu.username))
                if dbe.org_name != dbu.org_name:
                    logger.warning('org_name does not match for %s %s %s'
                                   % (repo_id, dbe.org_name, dbu.org_name))
                if dbe.owner_login != dbu.owner_login:
                    logger.warning('owner_login does not match for %s %s %s'
                                   % (repo_id, dbe.owner_login, dbu.owner_login))

            s.commit()
            s.flush()
            s.expunge_all()

        except Exception as e:
            logger.error('Exception storing repo details: %s:%s meta: %s, url: %s, exc: %s'
                         % (repo['id'], repo['full_name'], json.dumps(job.meta), job.url, e))
            logger.debug(traceback.format_exc())

        finally:
            utils.silent_close(s)

    if len(js) == 0:
        return

    # Load next page
    cur_page = utils.defvalkey(job.meta, 'page', 1)
    new_meta = dict(job.meta)
    new_meta['page'] = cur_page + 1

    if from_user:
        new_url = (self.USER_REPOS_URL % job.meta['user']) + ('?page=%s' % (cur_page + 1))
        job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPOS_USER, meta=new_meta)
    else:
        new_url = (self.ORG_REPOS_URL % job.meta['org']) + ('?page=%s' % (cur_page + 1))
        job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPOS_ORG, meta=new_meta)

    self.link_queue.put(job)
def process_org(self, job, js, headers, raw_response):
    """
    Process user -> orgs data
    :param job:
    :param js:
    :param headers:
    :param raw_response:
    :return:
    """
    new_orgs = []
    for org in js:
        if 'id' not in org:
            logger.error('Field ID not found in orgs')
            continue

        s = self.session()
        try:
            org_id = int(org['id'])

            # delete first - avoid excs
            s.query(GitHubUserOrgs)\
                .filter(GitHubUserOrgs.org_id == org_id)\
                .filter(GitHubUserOrgs.username == job.meta['user'])\
                .delete()

            dbu = GitHubUserOrgs()
            dbu.username = job.meta['user']
            dbu.org_id = org['id']
            dbu.org_name = org['login']
            dbu.org_desc = utils.utf8ize(org['description'])
            new_orgs.append(org['login'])

            s.add(dbu)
            s.commit()
            s.flush()
            s.expunge_all()

        except Exception as e:
            logger.error('Exception storing user->org details: %s: %s' % (org['id'], e))
            logger.debug(traceback.format_exc())

        finally:
            utils.silent_close(s)

    if len(js) == 0:
        return

    # Load next page
    cur_page = utils.defvalkey(job.meta, 'page', 1)
    new_url = (self.USER_ORGS_URL % job.meta['user']) + ('?page=%s' % (cur_page + 1))

    new_meta = dict(job.meta)
    new_meta['page'] = cur_page + 1

    job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_ORG, meta=new_meta)
    self.link_queue.put(job)

    # Load repositories for new organisations
    not_loaded_orgs = None
    with self.orgs_loaded_lock:
        new_orgs_set = set(new_orgs)
        not_loaded_orgs = new_orgs_set - self.orgs_loaded_set
        for x in new_orgs:
            self.orgs_loaded_set.add(x)

    for x in not_loaded_orgs:
        new_meta = dict(job.meta)
        new_meta['page'] = 1
        new_meta['org'] = x
        job = DownloadJob(url=self.ORG_REPOS_URL % x,
                          jtype=DownloadJob.TYPE_REPOS_ORG,
                          meta=new_meta)
        self.link_queue.put(job)