def get_data(self):
    """Collect the contributors of this thread's repository and reveal each one.

    When the global ``resume_stage`` is ``None`` or ``'contributors'`` the
    contributors returned by ``self.repository.get_contributors()`` are passed
    to ``self.build_list_of_programmers``; the ``(login, contributor)`` pairs
    of its result are then processed one by one.  Every login is added to
    ``self.repo_contributors`` and handed, with its contributor object, to
    ``developer_revealed``.  A failing contributor is retried until it
    succeeds, unless the module-level ``force_raise`` flag is set, in which
    case the exception is re-raised.  Finally the collected logins and their
    count are stored on ``self.repo``.

    ``self.cleanup()`` is called at the end of the method.

    Raises:
        AssertionError: if the API returned no contributors object.
        Exception: whatever ``developer_revealed`` raised, but only when
            ``force_raise`` is true.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source; confirm that the final bookkeeping belongs inside the ``if``.
    global resume_stage

    scream.say('Executing inside-thread method get_data() for: ' + str(self.threadId))
    if resume_stage in [None, 'contributors']:
        # Use this thread's own repository key.  The previous code read a bare
        # name ``key`` that is not bound anywhere in this method, so it either
        # raised NameError or reported whatever a module-level ``key`` held.
        repo_key = str(self.repo.getKey())

        scream.ssay('Checking size of a ' + repo_key + ' team')
        # 1. Team size of a repository
        self.contributors = self.repository.get_contributors()
        assert self.contributors is not None

        self.repo_contributors = set()
        self.contributors_static = self.build_list_of_programmers(
            self.contributors, self.repo.getKey(), self.repository)
        for login, contributor_object in self.contributors_static.items():
            scream.log_debug('move with contributor to next from contributors_static.items()', True)
            # Retry the same contributor until developer_revealed() succeeds.
            while True:
                scream.say('Inside while True: (line 674)')
                try:
                    self.contributor_login = login
                    self.contributor_object = contributor_object
                    scream.say(str(self.contributor_login))
                    self.repo_contributors.add(self.contributor_login)
                    scream.say(str(self.repo_contributors))
                    developer_revealed(self.threadId, self.repository,
                                       self.repo, self.contributor_object)
                    scream.say('Finished revealing developer')
                    break
                except TypeError as e:
                    scream.log_error('Repo + Contributor TypeError, or paginated through' +
                                     ' contributors gave error. ' + repo_key +
                                     ', error({0})'.format(str(e)), True)
                    repos_reported_execution_error.write(repo_key + os.linesep)
                    if force_raise:
                        raise
                except socket.timeout as e:
                    scream.log_error('Timeout while revealing details.. ' +
                                     ', error({0})'.format(str(e)), True)
                    freeze('socket.timeout in paginate through x contributors')
                    if force_raise:
                        raise
                except Exception as e:
                    scream.log_error('Exception while revealing details.. ' +
                                     ', error({0})'.format(str(e)), True)
                    freeze(str(e) + ' in paginate through x contributors')
                    if force_raise:
                        raise

        assert self.repo_contributors is not None
        self.repo.setContributors(self.repo_contributors)
        self.repo.setContributorsCount(len(self.repo_contributors))
        scream.log('Added contributors of count: ' + str(len(self.repo_contributors)) +
                   ' to a repo ' + repo_key)
    self.cleanup()
def get_data(self):
    """Build the list of programmers of this thread's repository and reveal them.

    When the global ``resume_stage`` is ``None`` or ``'contributors'`` the
    contributors returned by ``self.repository.get_contributors()`` are passed
    to ``self.build_list_of_programmers``; the ``(login, contributor)`` pairs
    of its result are then processed one by one.  Every login is added to
    ``self.repo_contributors`` and handed, with its contributor object, to
    ``developer_revealed``.  A failing contributor is retried until it
    succeeds, unless the module-level ``force_raise`` flag is set, in which
    case the exception is re-raised.  Finally the collected logins and their
    count are stored on ``self.repo``.

    ``self.cleanup()`` is called at the end of the method.

    Raises:
        AssertionError: if the API returned no contributors object.
        Exception: whatever ``developer_revealed`` raised, but only when
            ``force_raise`` is true.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source; confirm that the final bookkeeping belongs inside the ``if``.
    global resume_stage

    scream.say('Preparing to build list of programmers: ' + str(self.threadId))
    if resume_stage in [None, 'contributors']:
        # Use this thread's own repository key.  The previous code read a bare
        # name ``key`` that is not bound anywhere in this method, so it either
        # raised NameError or reported whatever a module-level ``key`` held.
        repo_key = str(self.repo.getKey())

        scream.ssay('Checking size of a ' + repo_key + ' team')
        # 1. Team size of a repository
        self.contributors = self.repository.get_contributors()
        assert self.contributors is not None

        self.repo_contributors = set()
        self.contributors_static = self.build_list_of_programmers(
            self.contributors, self.repo.getKey(), self.repository)
        for login, contributor_object in self.contributors_static.items():
            scream.log_debug('move with contributor to next from contributors_static.items()', True)
            # Retry the same contributor until developer_revealed() succeeds.
            while True:
                scream.say('Get details for a contributor..')
                try:
                    self.contributor_login = login
                    self.contributor_object = contributor_object
                    scream.say(str(self.contributor_login))
                    self.repo_contributors.add(self.contributor_login)
                    scream.say(str(self.repo_contributors))
                    developer_revealed(self.threadId, self.repository,
                                       self.repo, self.contributor_object)
                    scream.say('Finished revealing developer')
                    break
                except TypeError as e:
                    scream.log_error('Repo + Contributor TypeError, or paginated through' +
                                     ' contributors gave error. ' + repo_key +
                                     ', error({0})'.format(str(e)), True)
                    repos_reported_execution_error.write(repo_key + os.linesep)
                    if force_raise:
                        raise
                except socket.timeout as e:
                    scream.log_error('Timeout while revealing details.. ' +
                                     ', error({0})'.format(str(e)), True)
                    freeze('socket.timeout in paginate through x contributors')
                    if force_raise:
                        raise
                except Exception as e:
                    scream.log_error('Exception while revealing details.. ' +
                                     ', error({0})'.format(str(e)), True)
                    freeze(str(e) + ' in paginate through x contributors')
                    if force_raise:
                        raise

        assert self.repo_contributors is not None
        self.repo.setContributors(self.repo_contributors)
        self.repo.setContributorsCount(len(self.repo_contributors))
        scream.log('Added contributors of count: ' + str(len(self.repo_contributors)) +
                   ' to a repo ' + repo_key)
    self.cleanup()
def analyze_with_selenium(self, repository):
    """Scrape the counters shown on a repository's GitHub web page.

    The page at ``repository.html_url`` is loaded through ``self.browser``
    and parsed; loading and parsing are retried (after a short sleep) until
    one attempt finishes, unless the module-level ``force_raise`` flag is
    set, in which case the exception is re-raised.

    Args:
        repository: object exposing ``html_url``.

    Returns:
        dict: always contains ``'status'``:
            * ``'404'``   - the "parallax_illustration" element was found;
            * ``'EMPTY'`` - the "blankslate has-fixed-width" element was found;
            * ``'OK'``    - counters were parsed; the dict then also holds
              ``'commits'``, ``'branches'``, ``'releases'``,
              ``'contributors'``, ``'issues'`` and ``'pulls'``.

    Raises:
        AssertionError: if ``repository`` or its ``html_url`` is ``None``.
    """
    # Validate before the first dereference; the asserts used to come after
    # ``repository.html_url`` had already been used.
    assert repository is not None
    url = repository.html_url
    assert url is not None
    scream.say('Starting webinterpret for ' + url + '..')

    # Order of the <li> items inside <ul class="numbers-summary">.
    summary_fields = ('commits', 'branches', 'releases', 'contributors')
    # aria-label of a side-menu <li>  ->  key in the result dict.
    counter_fields = {'Pull Requests': 'pulls', 'Issues': 'issues'}

    while True:
        # Start every attempt from a clean dict so that values parsed by a
        # failed attempt cannot leak into the final answer.
        result = dict()
        try:
            self.browser.set_page_load_timeout(15)
            self.browser.get(url)
            scream.say('Data from web retrieved')
            doc = html.document_fromstring(unicode(self.browser.page_source))
            scream.log_debug(str(url), True)
            scream.say('Continue to work on ' + url)
            scream.say('Page source sent further')

            scream.say('Verify if 404 (repo deleted) otherwise keep on going')
            if len(doc.xpath('//div[@id="parallax_illustration"]')) > 0:
                scream.say('Verified that 404 (repo deleted)')
                result['status'] = '404'
                break
            scream.say('Verified that not 404')

            scream.say('Verify if repo empty otherwise keep on going')
            if len(doc.xpath('//div[@class="blankslate has-fixed-width"]')) > 0:
                scream.say('Verified that repo is empty')
                result['status'] = 'EMPTY'
                break
            scream.say('Verified that repo not empty')

            ns = doc.xpath('//ul[@class="numbers-summary"]')
            sunken = doc.xpath('//ul[@class="sunken-menu-group"]')
            scream.say('XPath made some search for ' + url + ' .. move on to bsoup..')
            scream.say('Xpath done searching')
            scream.say('Element found?: ' + str(len(ns) == 1))

            summary_items = BeautifulSoup(etree.tostring(ns[0])).findAll("li")
            menu_items = BeautifulSoup(etree.tostring(sunken[0])).findAll("li")

            for position, field in enumerate(summary_fields):
                scream.say('enumarables[' + str(position) + ']')
                number_text = analyze_tag(
                    summary_items[position].find("span", {"class": "num"}))
                scream.say('analyze_tag finished execution for ' + field + '_number')
                scream.say('Before parse number: ' + str(number_text))
                result[field] = parse_number(number_text)
                scream.log_debug(result[field], True)

            result['issues'] = 0
            result['pulls'] = 0
            for menu_item in menu_items:
                # .get() instead of [...]: an <li> without aria-label used to
                # raise KeyError, which the handler below retried forever.
                field = counter_fields.get(menu_item.get("aria-label"))
                if field is None:
                    continue
                number_text = analyze_tag(
                    menu_item.find("span", {"class": "counter"}))
                scream.say('Before parse number: ' + str(number_text))
                result[field] = parse_number(number_text)

            result['status'] = 'OK'
            break
        except TypeError as ot:
            scream.say(str(ot))
            scream.say('Scrambled results (TypeError). Maybe GitHub down. Retry')
            time.sleep(5.0)
            if force_raise:
                raise
        except Exception as e:
            scream.say(str(e))
            scream.say('No response from selenium. Retry')
            time.sleep(2.0)
            if force_raise:
                raise

    assert 'status' in result
    return result
def developer_revealed(thread_getter_instance, repository, repo, contributor):
    """Write one CSV row describing ``contributor`` in the context of ``repo``.

    The contributor's profile attributes are read, totals are accumulated
    over all repositories returned by ``contributor.get_repos()`` and, for
    each of those repositories, one row per entry of
    ``get_stats_contributors()`` is written to ``result_punch_card_writer``.
    If iterating the repositories fails, the listing is requested again and
    the totals are recomputed from zero (punch-card rows written before the
    failure are not withdrawn).  One summary row is finally written to
    ``result_writer``.

    Args:
        thread_getter_instance: the worker thread object (its ``threadId``
            is logged) or the thread id itself.
        repository: API repository object; only ``name`` is read.
        repo: local repository wrapper providing ``getUrl()``, ``getName()``,
            ``getOwner()``, ``getStargazersCount()`` and ``getWatchersCount()``.
        contributor: API user object of the developer being described.

    Raises:
        AssertionError: if ``result_punch_card_writer`` is ``None``.
        Exception: API errors, but only when the module-level
            ``force_raise`` flag is true.
    """
    global result_writer
    global result_punch_card_writer

    assert result_punch_card_writer is not None

    # Callers pass either the getter thread or just its id; accept both
    # rather than failing with AttributeError on ``.threadId`` of an int.
    thread_id = getattr(thread_getter_instance, 'threadId', thread_getter_instance)

    developer_login = contributor.login
    scream.log_debug('Assigning a contributor: ' + str(developer_login) +
                     ' to a repo: ' + str(repository.name), True)
    developer_name = contributor.name
    # Follower / following counters of the developer [FollowEvent]
    developer_followers = contributor.followers
    developer_following = contributor.following

    developer_location = contributor.location
    developer_total_private_repos = contributor.total_private_repos
    developer_total_public_repos = contributor.public_repos

    # 5. Repositories he did not create but is a team member of
    #    [TeamAddEvent] [MemberEvent]
    developer_collaborators = contributor.collaborators
    # 6. Repositories he did not create but contributes to
    #    [PushEvent] [IssuesEvent] [PullRequestEvent] [GollumEvent]
    developer_contributions = contributor.contributions

    # Repositories owned by the developer
    his_repositories = contributor.get_repos()

    # Columns that are not computed by this function; they keep the constant
    # values the previous implementation ended up writing.
    total_his_collaborators = 0
    total_his_contributors = 'N/A'
    total_his_issues = 'N/A'
    total_his_pull_requests = 'N/A'

    while True:
        # Reset on every attempt so a retry does not double count.
        total_his_repositories = 0
        total_his_stars = 0
        total_his_watchers = 0
        total_his_forks = 0
        total_his_has_issues = 0
        total_his_has_wiki = 0
        total_his_open_issues = 0
        total_network_count = 0

        try:
            for his_repo in his_repositories:
                total_his_repositories += 1
                total_his_forks += his_repo.forks_count
                total_his_stars += his_repo.stargazers_count
                total_his_watchers += his_repo.watchers_count
                total_his_has_issues += 1 if his_repo.has_issues else 0
                total_his_has_wiki += 1 if his_repo.has_wiki else 0
                total_his_open_issues += his_repo.open_issues
                total_network_count += his_repo.network_count

                try:
                    for s in his_repo.get_stats_contributors():
                        # Sum the per-week c / a / d fields of this author.
                        ad___c = 0
                        ad___a = 0
                        ad___d = 0
                        for w in s.weeks:
                            ad___c += w.c
                            ad___a += w.a
                            ad___d += w.d
                        result_punch_card_writer.writerow(
                            [str(his_repo.owner.login), str(his_repo.name),
                             str(developer_login), str(s.author.login),
                             str(s.total), str(ad___c), str(ad___a), str(ad___d)])
                except GithubException as e:
                    freeze(str(e) + ' in try per repo of x-dev repos')
                    # e.data is not guaranteed to be a dict; guard the lookup
                    # so the handler itself cannot raise TypeError.
                    error_data = e.data if isinstance(e.data, dict) else {}
                    message = error_data.get("message") or ""
                    if message.strip() == "Repository access blocked":
                        scream.log_debug("It is a private repo.. Skip!")
                        continue
                    if force_raise:
                        raise
                except Exception as e:
                    freeze(str(e) + ' in try per repo of x-dev repos')
                    # probably punch card not ready
                    if force_raise:
                        raise
            break
        except Exception as e:
            freeze(str(e) + ' in main loop of developer_revealed()')
            # Ask for a fresh listing before starting over.
            his_repositories = contributor.get_repos()
            if force_raise:
                raise

    # Developer company (if any given)
    company = contributor.company
    created_at = contributor.created_at
    # Does the developer want to be hired?
    hireable = contributor.hireable

    scream.log_debug('Thread ' + str(thread_id) +
                     ' Finished revealing contributor: ' + str(developer_login) +
                     ' in a repo: ' + str(repository.name), True)

    if show_trace:
        scream.log_debug('Printing traceback stack', True)
        traceback.print_stack()
        scream.log_debug('Printing traceback exc pathway', True)
        traceback.print_exc()

    # With use_utf8 the free-text columns are written untouched, otherwise
    # they are passed through str(); numeric columns always go through str().
    if use_utf8:
        def text(value):
            return value
    else:
        text = str

    def optional_text(value):
        return '' if value is None else text(value)

    result_writer.writerow([
        text(repo.getUrl()), text(repo.getName()), text(repo.getOwner()),
        str(repo.getStargazersCount()), str(repo.getWatchersCount()),
        text(developer_login), optional_text(developer_name),
        str(developer_followers), str(developer_following),
        str(developer_collaborators), optional_text(company),
        str(developer_contributions), str(created_at),
        (str(hireable) if hireable is not None else ''),
        str(total_his_repositories), str(total_his_stars),
        str(total_his_collaborators), str(total_his_contributors),
        str(total_his_watchers), str(total_his_forks),
        str(total_his_has_issues), str(total_his_has_wiki),
        str(total_his_open_issues), str(total_network_count),
        optional_text(developer_location),
        str(developer_total_private_repos), str(developer_total_public_repos),
        str(total_his_issues), str(total_his_pull_requests)])

    scream.log_debug('Wrote row to CSV.', True)
(resume_on_repo_owner == repo.getOwner())): iteration_step_count += 1 continue else: resume_on_repo = None iteration_step_count += 1 if resume_on_repo_inclusive: scream.say('Not skipping the ' + str(resume_on_repo_name)) else: scream.say('Starting from the next from ' + str(resume_on_repo_name)) continue try: while True: if show_trace: scream.log_debug('Printing traceback stack', True) traceback.print_stack() scream.log_debug('Printing traceback exc pathway', True) traceback.print_exc() #scream.log_warning(inspect.getargvalues(sys.exc_info()[2].tb_frame)) scream.say('Creating Repository.py instance from API result..') scream.say('Working at the moment on repo: ' + str(repo.getKey())) current_ghc = github_clients[num_modulo(thread_id_count)] current_ghc_desc = github_clients_ids[num_modulo(thread_id_count)] repository = current_ghc.get_repo(repo.getKey()) scream.log_debug('Got a repository from API', True) repo.setRepoObject(repository) repo.setStargazersCount(repository.stargazers_count) scream.say('There are ' + str(repo.getStargazersCount()) + ' stargazers.') assert repo.getStargazersCount() is not None repo.setWatchersCount(repository.watchers_count) # PyGithub must be joking, this works, watchers_count not
def execute_check():
    """Classify the gender of names from a GHTorrent sample and store the result.

    Steps:
        1. parse ``-v/--verbose``;
        2. connect to the local ``github`` MySQL database and check that the
           ``users`` and ``projects`` tables exist (exits otherwise);
        3. ask for the sample table/view and read every distinct ``name``
           whose ``gender`` is NULL;
        4. start one ``GeneralGetter`` worker per new first name, keeping at
           most a few workers active at a time, and record the full names in
           the module-level ``names`` dict;
        5. once all workers are done, write ``names[...]['classification']``
           into the ``gender`` column of the chosen table.

    Every ``print`` uses a single parenthesised argument so the statement
    means the same with and without ``print_function``.
    """
    def read_credential(path):
        # Close the file deterministically instead of leaking the handle.
        with open(path, 'r') as handle:
            return handle.read()

    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", help="verbose messaging ? [True/False]",
                        action="store_true")
    args = parser.parse_args()
    if args.verbose:
        scream.intelliTag_verbose = True
        scream.say("verbosity turned on")

    threads = []

    # init connection to database
    first_conn = MSQL.connect(host=IP_ADDRESS, port=3306,
                              user=read_credential('mysqlu.dat'),
                              passwd=read_credential('mysqlp.dat'),
                              db="github", connect_timeout=50000000,
                              charset='utf8', init_command='SET NAMES UTF8',
                              use_unicode=True)
    print('Testing MySql connection...')
    ping_result = first_conn.ping(True)
    print('Pinging database: ' + (str(ping_result) if ping_result is not None else 'NaN'))

    cursor = first_conn.cursor()
    cursor.execute(r'SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = "%s"' % 'github')
    rows = cursor.fetchall()
    print('There are: ' + str(rows[0][0]) + ' table objects in the local GHtorrent copy')
    cursor.execute(r'SELECT table_name FROM information_schema.tables WHERE table_schema = "%s"' % 'github')
    rows = cursor.fetchall()
    # Both memberships must be tested explicitly: ``(a) and (b) in rows``
    # only ever checked ``b`` because a non-empty tuple is always true.
    if (u'users', ) in rows and (u'projects', ) in rows:
        print('All neccesary tables are there.')
    else:
        print('Your database does not fit a typical description of a GitHub Torrent copy..')
        sys.exit(0)

    # NOTE(review): the table name typed by the operator is concatenated into
    # SQL below (identifiers cannot be bound as parameters); it is trusted.
    sample_tb_name = raw_input("Please enter table/view name (of chosen data sample): ")
    cursor.execute(r'select count(distinct name) from ' + str(sample_tb_name) +
                   ' where ((name is not NULL) and (gender is NULL))')
    rows = cursor.fetchall()
    record_count = rows[0][0]
    cursor.close()

    scream.say("Database seems to be working. Move on to getting list of users.")

    # populate list of users to memory
    cursor = first_conn.cursor()
    answer = raw_input("Should I update [users] table instead of [" +
                       str(sample_tb_name) + "]? [y/n]: ")
    is_locked_tb = answer in ('yes', 'y')
    print('Querying all names from the observations set.. This can take around 25-30 sec.')

    cursor.execute(r'select distinct name from ' + str(sample_tb_name) +
                   ' where ((name is not NULL) and (gender is NULL))')
    # if you are interested in how this table was created, you will probably
    # need to read our paper and contact us as well, because we have some more
    # tables with aggregated data compared to standard GitHub Torrent collection
    row = cursor.fetchone()
    # ``iterator`` doubles as the worker id source, so its numbering (first
    # worker gets 2) is kept; only the progress figure is corrected below.
    iterator = 1.0

    min_name_length = 2
    print('We hypothetize that minimum name length are ' +
          str(min_name_length) + ' characters, like Ho, Sy, Lu')
    # http://www.answers.com/Q/What_is_the_shortest_name_in_the_world

    while row is not None:
        fullname = unicode(row[0])
        scream.log("\tFullname is: " + str(fullname.encode('unicode_escape')))
        iterator += 1
        # (iterator - 1) rows have been read so far; the old formula was one
        # ahead and finished above 100%.
        print("[Progress]: " + str(((iterator - 1) / record_count) * 100) + "% ----------- ")

        name_parts = fullname.split()
        # A whitespace-only value has no first name either, and would make
        # ``split()[0]`` raise IndexError.
        if len(fullname) < min_name_length or not name_parts:
            scream.log_warning("--Found too short name field (" +
                               str(fullname.encode('utf-8')) +
                               ") from DB. Skipping..", True)
            row = cursor.fetchone()
            continue

        # I find it quite uncommon to separate name from surname with
        # something else than a space; it does occur, but detecting such
        # human-generated dirty data is out of scope at the moment
        name = name_parts[0]
        scream.log("\tName is: " + str(name.encode('unicode_escape')))

        if name in names:
            if fullname in names[name]['persons']:
                scream.say("\tSuch fullname already classified! Rare, but can happen. Move on.")
            else:
                scream.say("\tAdding fullname to already classified name. Move on")
                names[name]['persons'].append(fullname)
        else:
            scream.say("\tNew name. Lets start classification.")
            names[name] = {'persons': list(), 'classification': None}
            names[name]['persons'].append(fullname)
            scream.say("\tStart the worker on name: " + str(name.encode('utf-8')) +
                       " as deriven from: " + str(fullname.encode('utf-8')))
            # start the worker
            gg = GeneralGetter(int(iterator), name)
            scream.say('Creating instance of GeneralGetter complete')
            scream.say('Appending thread to collection of threads')
            threads.append(gg)
            scream.say('Append complete, threads[] now have size: ' + str(len(threads)))
            scream.log_debug('Starting thread ' + str(int(iterator) - 1) + '....', True)
            gg.start()
            # throttle: poll every 200 ms while more than 3 workers are active
            while num_working(threads) > 3:
                time.sleep(0.2)

        row = cursor.fetchone()

    cursor.close()

    # Let the last workers finish, otherwise their names would still carry
    # ``classification is None`` when the table is updated.
    while num_working(threads) > 0:
        time.sleep(0.2)

    print("Finished getting gender data, moving to database update...")

    target_table = 'users' if is_locked_tb else sample_tb_name
    # Values are bound by the driver: names containing quotes or backslashes
    # are escaped correctly and a missing classification is stored as NULL
    # (the string-built statement produced the invalid ``gender = None``).
    update_query = 'UPDATE ' + str(target_table) + ' SET gender = %s where name = %s'
    cursor = first_conn.cursor()
    for key, collection in names.items():
        gender = collection['classification']
        for fullname in collection['persons']:
            print(update_query + ' <- ' + repr((gender, fullname)))
            cursor.execute(update_query, (gender, fullname))
    cursor.close()

    # DB-API connections do not autocommit by default; without this the
    # updates are rolled back on close for transactional storage engines.
    first_conn.commit()
    first_conn.close()
# get_data(self, page, conn) -- processes one page of OpenHub search results.
#
# Parameters:
#   page -- page number sent as the `page` query parameter to OpenHub.
#   conn -- database connection; only `conn.cursor()` is used here.
#
# What the visible code does:
#   * requests https://www.openhub.net/projects.xml for the tag
#     openhub_query_tags[0], sorted by rating, and calls sys.exit() when the
#     XML answer contains an <error> element;
#   * adds <items_returned> to the global `results_done` and stores
#     <items_available> in the global `results_all`;
#   * for every <result/project>: downloads its enlistments, collects the
#     repository ids of Git enlistments whose URL starts with
#     "git://github.com/", and skips the project when there is none;
#   * downloads the detailed project XML and reads the analysis counters
#     (NullChar is used when an optional element is absent);
#   * for every collected GitHub repository: reads its attributes through the
#     client selected with num_modulo(self.i-1), looks its id up in the
#     `projects` table (the repository is skipped when no row is found), reads
#     the member logins from `project_members` joined with `users`, loads
#     each member through the GitHub API and writes rows to `csv_writer`;
#   * UnknownObjectException is reported and skipped, GithubException is
#     re-raised.
#
# NOTE(review): part of this method is damaged in this copy -- the text
# between the "Starting to analyze OSRC card for user: " log call and the
# "PushEvent" comparison has been replaced by runs of asterisks, so the
# initialisation of self.tries, self.usage_element and the developer_*
# counters is not visible.  Restore it from version control before editing.
# NOTE(review): both SQL statements are built with string formatting from
# repository names / ids; consider bound parameters -- confirm with the owner.
# NOTE(review): presumably self.set_finished(True) is the last statement of
# the method and not part of the GithubException handler -- the original
# indentation is lost, verify.
def get_data(self, page, conn): global results_done global results_all global pagination global openhub_query_tags self.params_sort_rating = urllib.urlencode({'query': 'tag:' + openhub_query_tags[0], 'api_key': return_random_openhub_key(), 'sort': 'rating', 'page': page}) self.projects_api_url = "https://www.openhub.net/projects.xml?%s" % (self.params_sort_rating) self.result_flow = urllib.urlopen(self.projects_api_url) scream.say('') scream.say('-------------------------- PAGE ' + str(page) + ' parsed -----------------------------') scream.say('') # Parse the response into a structured XML object self.tree = ET.parse(self.result_flow) # Did Ohloh return an error? self.elem = self.tree.getroot() self.error = self.elem.find("error") if self.error is not None: print 'OpenHub returned ERROR:', ET.tostring(self.error), sys.exit() results_done += int(self.elem.find("items_returned").text) results_all = int(self.elem.find("items_available").text) self.i = 0 for self.node in self.elem.findall("result/project"): self.i += 1 scream.say('Checking element ' + str(self.i) + '/' + str(pagination)) self.project_id = self.node.find("id").text self.project_name = self.node.find("name").text self.project_url = self.node.find("url").text self.project_htmlurl = self.node.find("html_url").text self.project_created_at = self.node.find("created_at").text self.project_updated_at = self.node.find("updated_at").text self.project_homepage_url = self.node.find("homepage_url").text self.project_average_rating = self.node.find("average_rating").text self.project_rating_count = self.node.find("rating_count").text self.project_review_count = self.node.find("review_count").text self.project_activity_level = self.node.find("project_activity_index/value").text self.project_user_count = self.node.find("user_count").text # project may have multiple GitHub repositories # or even it may be not present on GitHub - check that self.is_github_project = False self.github_repo_id = None # in case of multiple 
github CODE repositories (quite often) # treat as a seperate repo - remember, we focus on github repositories, not aggregates self.enlistments_detailed_params = urllib.urlencode({'api_key': return_random_openhub_key()}) self.enlistments_detailed_url = "https://www.openhub.net/projects/%s/enlistments.xml?%s" % (self.project_id, self.enlistments_detailed_params) self.enlistments_result_flow = urllib.urlopen(self.enlistments_detailed_url) # Parse the response into a structured XML object self.enlistments_tree = ET.parse(self.enlistments_result_flow) # Did Ohloh return an error? self.enlistments_elem = self.enlistments_tree.getroot() self.enlistments_error = self.enlistments_elem.find("error") if self.enlistments_error is not None: print 'Ohloh returned:', ET.tostring(self.enlistments_error), sys.exit() self.repos_lists = list() for self.enlistment_node in self.enlistments_elem.findall("result/enlistment"): self.ee_type = self.enlistment_node.find("repository/type").text if (self.ee_type == "GitRepository"): self.ee_link = self.enlistment_node.find("repository/url").text if (self.ee_link.startswith("git://github.com/")): scream.say('Is a GitHub project!') self.is_github_project = True self.github_repo_id = self.ee_link.split("git://github.com/")[1].split(".git")[0] scream.say(self.github_repo_id) self.repos_lists.append(self.github_repo_id) if not self.is_github_project: continue # now lets get even more sophisticated details self.params_detailed_url = urllib.urlencode({'api_key': return_random_openhub_key()}) self.project_detailed_url = "https://www.openhub.net/projects/%s.xml?%s" % (self.project_id, self.params_detailed_url) # how come here was a typo ? self.detailed_result_flow = urllib.urlopen(self.project_detailed_url) # Parse the response into a structured XML object self.detailed_tree = ET.parse(self.detailed_result_flow) # Did Ohloh return an error? 
self.detailed_elem = self.detailed_tree.getroot() self.detailed_error = self.detailed_elem.find("error") if self.detailed_error is not None: print 'Ohloh returned:', ET.tostring(self.detailed_error), sys.exit() self.twelve_month_contributor_count = self.detailed_elem.find("result/project/analysis/twelve_month_contributor_count").text self.total_contributor_count = self.detailed_elem.find("result/project/analysis/total_contributor_count").text self.twelve_month_commit_count = self.detailed_elem.find("result/project/analysis/twelve_month_commit_count") self.twelve_month_commit_count = self.twelve_month_commit_count.text if self.twelve_month_commit_count is not None else NullChar self.total_commit_count = self.detailed_elem.find("result/project/analysis/total_commit_count") self.total_commit_count = self.total_commit_count.text if self.total_commit_count is not None else NullChar self.total_code_lines = self.detailed_elem.find("result/project/analysis/total_code_lines") self.total_code_lines = self.total_code_lines.text if self.total_code_lines is not None else NullChar self.main_language_name = self.detailed_elem.find("result/project/analysis/main_language_name") self.main_language_name = self.main_language_name.text if self.main_language_name is not None else NullChar self.current_ghc = github_clients[num_modulo(self.i-1)] self.current_ghc_desc = github_clients_ids[num_modulo(self.i-1)] print 'Now using github client id: ' + str(self.current_ghc_desc) for self.gh_entity in self.repos_lists: try: self.repository = self.current_ghc.get_repo(self.gh_entity) self.repo_name = self.repository.name self.repo_full_name = self.repository.full_name self.repo_html_url = self.repository.html_url self.repo_stargazers_count = self.repository.stargazers_count self.repo_forks_count = self.repository.forks_count self.repo_created_at = self.repository.created_at self.repo_is_fork = self.repository.fork self.repo_has_issues = self.repository.has_issues self.repo_open_issues_count = 
self.repository.open_issues_count self.repo_has_wiki = self.repository.has_wiki self.repo_network_count = self.repository.network_count self.repo_pushed_at = self.repository.pushed_at self.repo_size = self.repository.size self.repo_updated_at = self.repository.updated_at self.repo_watchers_count = self.repository.watchers_count # Now its time to get the list of developers! # yay! rec-09 mysql instance is visible from the yoshimune computer ! # ok, but I forgot github blacklisted our comptuing clusters # make sure your local win machine runs it.. # just pjatk things.. carry on scream.say('Retrieving the project id from mysql database.. should take max 1 second.') # Get here project id used in the database ! #conn.ping(True) self.cursor = conn.cursor() self.cursor.execute(r'select distinct id from (select * from projects where `name`="{0}") as p where url like "%{1}"'.format(self.repo_name, self.repo_full_name)) self.rows = self.cursor.fetchall() try: self.repo_db_id = self.rows[0] except: #print str(cursor.info()) # this is too new repo , because it is not found on mysql db, skip it ! continue #print 'Faulty query was: -------- ' #print r'select distinct id from (select * from projects where `name`="{0}") as p where url like "%{1}"'.format(self.repo_name, self.repo_full_name) scream.say('project id retrieved from database is: ' + str(self.repo_db_id)) self.cursor.close() #conn.ping(True) self.cursor = conn.cursor() # Now get list of GitHub logins which are project_members ! 
self.cursor.execute(r'SELECT login FROM project_members INNER JOIN users ON users.id = project_members.user_id WHERE repo_id = %s' % self.repo_db_id) self.project_developers = self.cursor.fetchall() self.project_developers = [i[0] for i in self.project_developers] # unzipping tuples in tuples self.contributors_count = len(self.project_developers) self.cursor.close() #conn.close() for self.project_developer in self.project_developers: # create a GitHub user named object for GitHub API self.current_user = self.current_ghc.get_user(self.project_developer) self.current_user_bio = self.current_user.bio self.current_user_blog = self.current_user.blog self.current_user_collaborators = self.current_user.collaborators self.current_user_company = self.current_user.company self.current_user_contributions = self.current_user.contributions self.current_user_created_at = self.current_user.created_at self.current_user_followers = self.current_user.followers self.current_user_following = self.current_user.following self.current_user_hireable = self.current_user.hireable self.current_user_login = self.current_user.login self.current_user_name = self.current_user.name self.developer_login = self.project_developer # Does he commit during business hours? 
scream.log_debug("Starting to analyze OSRC card for user: "******"Histogram for hours for user: "******"PushEvent"): self.developer_all_pushes += self.usage_element['total'] elif (self.usage_element['type'] == "WatchEvent"): self.developer_all_stars_given += self.usage_element['total'] elif (self.usage_element['type'] == "CreateEvent"): self.developer_all_creations += self.usage_element['total'] elif (self.usage_element['type'] == "IssuesEvent"): self.developer_all_issues_created += self.usage_element['total'] elif (self.usage_element['type'] == "PullRequestEvent"): self.developer_all_pull_requests += self.usage_element['total'] # ----------------------------------------------------------------------- scream.log_debug('Finished analyze OSRC card for user: '******'OSRC gave error, probably 404') scream.say('try ' + str(self.tries) + ' more times') self.tries -= 1 finally: if self.tries < 1: self.developer_works_during_bd = None self.developer_works_period = 0 break self.collection = [str(((page-1)*pagination) + self.i), self.gh_entity, self.repo_full_name, self.repo_html_url, str(self.repo_forks_count), str(self.repo_stargazers_count), str(self.contributors_count), str(self.repo_created_at), str(self.repo_is_fork), str(self.repo_has_issues), str(self.repo_open_issues_count), str(self.repo_has_wiki), str(self.repo_network_count), str(self.repo_pushed_at), str(self.repo_size), str(self.repo_updated_at), str(self.repo_watchers_count), self.project_id, self.project_name, self.project_url, self.project_htmlurl, str(self.project_created_at), str(self.project_updated_at), self.project_homepage_url, str(self.project_average_rating), str(self.project_rating_count), str(self.project_review_count), self.project_activity_level, str(self.project_user_count), str(self.twelve_month_contributor_count), str(self.total_contributor_count), str(self.twelve_month_commit_count), str(self.total_commit_count), str(self.total_code_lines), self.main_language_name, 
str(self.developer_works_during_bd), str(self.developer_works_period), str(self.developer_all_pushes), str(self.developer_all_stars_given), str(self.developer_all_creations), str(self.developer_all_issues_created), str(self.developer_all_pull_requests)] csv_writer.writerow(self.collection) #self.set_finished(True) print '.' except UnknownObjectException: print 'Repo ' + self.gh_entity + ' is not available anymore..' except GithubException: # TODO: write here something clever raise self.set_finished(True)
'stargazers_count', 'contributors_count', 'repo_created_at', 'repo_is_fork', 'repo_has_issues', 'repo_open_issues_count', 'repo_has_wiki', 'repo_network_count', 'repo_pushed_at', 'repo_size', 'repo_updated_at', 'repo_watchers_count', 'project_id', 'project_name', 'project_url', 'project_htmlurl', 'project_created_at', 'project_updated_at', 'project_homepage_url', 'project_average_rating', 'project_rating_count', 'project_review_count', 'project_activity_level', 'project_user_count', 'twelve_month_contributor_count', 'total_contributor_count', 'twelve_month_commit_count', 'total_commit_count', 'total_code_lines', 'main_language_name', 'developer_works_during_bd', 'developer_works_period', 'developer_all_pushes', 'developer_all_stars_given', 'developer_all_creations', 'developer_all_issues_created', 'developer_all_pull_requests'] if force_add_excelsep: csv_writer.writerow(sepinfo) csv_writer.writerow(headers) Github(login_or_token=credential['pass'], client_id=credential['client_id'], client_secret=credential['client_secret'], user_agent=credential['login'], timeout=timeout) while (results_done < results_all): # Connect to the Ohloh website and retrieve the account data. page += 1 gg = GeneralGetter(thread_id_count, page) scream.say('Creating instance of GeneralGetter complete') scream.say('Appending thread to collection of threads') threads.append(gg) scream.say('Append complete, threads[] now have size: ' + str(len(threads))) thread_id_count += 1 scream.log_debug('Starting thread ' + str(thread_id_count-1) + '....', True) gg.start() while (num_working(threads) > 7): time.sleep(0.2) # sleeping for 200 ms - there are already 8 active threads..
def analyze_with_selenium(self, repository):
    """Scrape repository counters from the GitHub web page using Selenium.

    Loads ``repository.html_url`` in ``self.browser`` and reads the numbers
    shown in the ``numbers-summary`` and ``sunken-menu-group`` lists.  Any
    exception raised while loading or parsing is logged and the whole
    attempt is repeated after a pause; when the module-level ``force_raise``
    flag is set the exception is re-raised instead.

    Returns a dict that always has a ``'status'`` key: ``'404'`` (the
    "parallax_illustration" element was found), ``'EMPTY'`` (a blankslate
    element was found) or ``'OK'``.  With ``'OK'`` the dict also holds
    ``'commits'``, ``'branches'``, ``'releases'``, ``'contributors'``,
    ``'issues'`` and ``'pulls'`` as returned by ``parse_number``
    (``'issues'`` / ``'pulls'`` stay 0 when no matching menu entry exists).
    """
    # Validate first, so a missing repository fails on the assertion and not
    # on the attribute access used for logging.
    assert repository is not None
    url = repository.html_url
    assert url is not None
    scream.say('Starting webinterpret for ' + url + '..')

    result = dict()
    while True:
        try:
            self.browser.set_page_load_timeout(15)
            self.browser.get(url)
            scream.say('Data from web retrieved')
            doc = html.document_fromstring(unicode(self.browser.page_source))
            scream.log_debug(str(url), True)
            scream.say('Continue to work on ' + url)
            scream.say('Page source sent further')

            scream.say('Verify if 404 (repo deleted) otherwise keep on going')
            parallax = doc.xpath('//div[@id="parallax_illustration"]')
            if len(parallax) > 0:
                scream.say('Verified that 404 (repo deleted)')
                result['status'] = '404'
                break
            scream.say('Verified that not 404')

            scream.say('Verify if repo empty otherwise keep on going')
            repo_empty = doc.xpath('//div[@class="blankslate has-fixed-width"]')
            if len(repo_empty) > 0:
                scream.say('Verified that repo is empty')
                result['status'] = 'EMPTY'
                break
            scream.say('Verified that repo not empty')

            ns = doc.xpath('//ul[@class="numbers-summary"]')
            sunken = doc.xpath('//ul[@class="sunken-menu-group"]')
            scream.say('XPath made some search for ' + url + ' .. move on to bsoup..')
            scream.say('Xpath done searching')
            scream.say('Element found?: ' + str(len(ns) == 1))

            # An IndexError here (lists not found) is handled like any other
            # failure: the page is fetched again.
            local_soup = BeautifulSoup(etree.tostring(ns[0]))
            local_soup_sunken = BeautifulSoup(etree.tostring(sunken[0]))
            enumarables = local_soup.findAll("li")
            enumarables_more = local_soup_sunken.findAll("li")

            # The summary list is read positionally: commits, branches,
            # releases, contributors.
            summary_keys = ('commits', 'branches', 'releases', 'contributors')
            for position, key in enumerate(summary_keys):
                scream.say('enumarables[' + str(position) + ']')
                raw_number = analyze_tag(
                    enumarables[position].find("span", {"class": "num"}))
                if key == 'commits':
                    scream.say('analyze_tag finished execution for commits_number')
                scream.say('Before parse number: ' + str(raw_number))
                result[key] = parse_number(raw_number)
                scream.log_debug(result[key], True)

            result['issues'] = 0
            result['pulls'] = 0
            for menu_item in enumarables_more:
                # .get() rather than []: an <li> without an aria-label used to
                # raise KeyError, which the generic handler below turned into
                # an endless retry of the same page.
                label = menu_item.get("aria-label")
                if label == "Pull Requests":
                    pulls_number = analyze_tag(
                        menu_item.find("span", {"class": "counter"}))
                    scream.say('Before parse number: ' + str(pulls_number))
                    result['pulls'] = parse_number(pulls_number)
                elif label == "Issues":
                    issues_number = analyze_tag(
                        menu_item.find("span", {"class": "counter"}))
                    scream.say('Before parse number: ' + str(issues_number))
                    result['issues'] = parse_number(issues_number)

            result['status'] = 'OK'
            break
        except TypeError as ot:
            scream.say(str(ot))
            scream.say('Scrambled results (TypeError). Maybe GitHub down. Retry')
            time.sleep(5.0)
            if force_raise:
                raise
        except Exception as e:
            scream.say(str(e))
            scream.say('No response from selenium. Retry')
            time.sleep(2.0)
            if force_raise:
                raise

    assert 'status' in result
    return result
def developer_revealed(thread_getter_instance, repository, repo, contributor):
    """Gather statistics about one contributor and write them as a CSV row.

    Reads profile attributes from ``contributor``, walks over the
    repositories returned by ``contributor.get_repos()`` summing their
    counters, and finally writes a single row through the module-level
    ``result_writer``.

    The module-level ``count___`` selects how per-repository team figures
    are obtained:

    * ``'api'``      - count ``get_contributors()`` / ``get_collaborators()``;
    * ``'selenium'`` - scrape the repository page through
      ``thread_getter_instance.analyze_with_selenium``;
    * anything else  - count distinct author logins from
      ``get_stats_contributors()``.

    Failures are reported through ``freeze`` and retried; when the
    module-level ``force_raise`` flag is set they are re-raised instead.

    :param thread_getter_instance: object providing ``analyze_with_selenium``
        and ``threadId``.
    :param repository: API repository object; only ``name`` is read (logging).
    :param repo: local repository record providing the ``get*`` accessors
        used for the first CSV columns.
    :param contributor: API user object being described.
    """
    global result_writer

    developer_login = contributor.login
    scream.log_debug('Assigning a contributor: ' + str(developer_login) +
                     ' to a repo: ' + str(repository.name), True)
    developer_name = contributor.name
    # 1. Follower count of the developer [FollowEvent]
    developer_followers = contributor.followers
    # 2. Number of people the developer follows [FollowEvent]
    developer_following = contributor.following
    developer_location = contributor.location
    developer_total_private_repos = contributor.total_private_repos
    developer_total_public_repos = contributor.public_repos
    # 5. Number of repos he did not create in which he is a team member
    #    [TeamAddEvent] [MemberEvent]
    developer_collaborators = contributor.collaborators
    # 6. Number of repos he did not create in which he is a contributor
    #    [PushEvent] [IssuesEvent] [PullRequestEvent] [GollumEvent]
    developer_contributions = contributor.contributions
    # Projects created by him
    his_repositories = contributor.get_repos()

    while True:
        # Every pass over the repositories starts from clean totals, so a
        # retry after a failure never double-counts.
        total_his_repositories = 0
        total_his_stars = 0
        total_his_watchers = 0
        total_his_forks = 0
        total_his_has_issues = 0
        total_his_has_wiki = 0
        total_his_open_issues = 0
        total_network_count = 0
        total_his_collaborators = 0
        total_his_contributors = 0
        # Initialised in every mode: issues / pull requests are always written
        # to the CSV row, and used to raise NameError outside 'selenium' mode.
        # Commits / branches / releases are summed but not written to the row.
        total_his_commits = 0
        total_his_branches = 0
        total_his_releases = 0
        total_his_issues = 0
        total_his_pull_requests = 0

        try:
            for his_repo in his_repositories:
                try:
                    total_his_repositories += 1
                    total_his_forks += his_repo.forks_count
                    total_his_stars += his_repo.stargazers_count
                    total_his_watchers += his_repo.watchers_count
                    total_his_has_issues += 1 if his_repo.has_issues else 0
                    total_his_has_wiki += 1 if his_repo.has_wiki else 0
                    total_his_open_issues += his_repo.open_issues
                    total_network_count += his_repo.network_count

                    if count___ == 'api':
                        # 3. Number of developers in the projects he created
                        #    [PushEvent] [IssuesEvent] [PullRequestEvent]
                        #    [GollumEvent]
                        repo_contributors = None
                        while True:
                            try:
                                # get_contributors().totalCount is buggy and
                                # will make errors, hence the manual count.
                                repo_contributors = sum(
                                    1 for _ in his_repo.get_contributors())
                                break
                            except Exception:
                                freeze('Exception in getting total_his_contributors')
                                if force_raise:
                                    raise
                        assert repo_contributors is not None
                        # Accumulate across repositories; the total used to be
                        # reset here, so only the last repository was reported.
                        total_his_contributors += repo_contributors

                        # 4. Number of collaborators in the projects he created
                        repo_collaborators = None
                        while True:
                            try:
                                # get_collaborators().totalCount is buggy and
                                # will make errors, hence the manual count.
                                repo_collaborators = sum(
                                    1 for _ in his_repo.get_collaborators())
                                break
                            except Exception:
                                freeze('Exception in getting total_his_collaborators')
                                if force_raise:
                                    raise
                        assert repo_collaborators is not None
                        total_his_collaborators += repo_collaborators
                    elif count___ == 'selenium':
                        scream.say('Using selenium for thread about ' +
                                   str(developer_login) + ' \'s repositories')
                        # Extract the statistics via selenium; returned keys:
                        # commits, branches, releases, contributors, issues,
                        # pulls (plus status).
                        result = thread_getter_instance.analyze_with_selenium(his_repo)
                        if result['status'] == '404':
                            continue
                        if result['status'] == 'EMPTY':
                            continue
                        total_his_commits += result['commits']
                        total_his_branches += result['branches']
                        total_his_releases += result['releases']
                        total_his_issues += result['issues']
                        total_his_pull_requests += result['pulls']
                        total_his_contributors += result['contributors']
                    else:
                        # Hence it is only when selenium is not used.
                        while True:
                            try:
                                his_contributors = set()
                                stats = his_repo.get_stats_contributors()
                                assert stats is not None
                                for stat in stats:
                                    if str(stat.author.login).strip() in ['None', '']:
                                        continue
                                    his_contributors.add(stat.author.login)
                                total_his_contributors += len(his_contributors)
                                break
                            except Exception as exc:
                                scream.log_warning(
                                    'Not ready data while revealing details.. ' +
                                    ', error({0})'.format(str(exc)), True)
                                freeze('StatsContribution not ready.. waiting for the server to provide good data')
                                if force_raise:
                                    raise
                except GithubException as e:
                    freeze(str(e) + ' in try per repo of x-dev repos')
                    if ("message" in e.data) and (e.data["message"].strip() == "Repository access blocked"):
                        scream.log_debug("It is a private repo.. Skip!")
                        continue
                    if force_raise:
                        raise
            break
        except Exception as e:
            freeze(str(e) + ' in main loop of developer_revealed()')
            # Fetch a fresh repository listing before the next attempt.
            his_repositories = contributor.get_repos()
            if force_raise:
                raise

    # Developer company (if any given)
    company = contributor.company
    created_at = contributor.created_at
    # Does the developer want to be hired?
    hireable = contributor.hireable

    scream.log_debug('Thread ' + str(thread_getter_instance.threadId) +
                     ' Finished revealing contributor: ' + str(developer_login) +
                     ' in a repo: ' + str(repository.name), True)

    if show_trace:
        scream.log_debug('Printing traceback stack', True)
        traceback.print_stack()
        scream.log_debug('Printing traceback exc pathway', True)
        traceback.print_exc()

    # With use_utf8 the textual fields are handed to the writer untouched,
    # otherwise they are passed through str() first.
    if use_utf8:
        def as_text(value):
            return value
    else:
        as_text = str

    def or_blank(value, convert):
        """Return convert(value), or '' when value is None."""
        return convert(value) if value is not None else ''

    result_writer.writerow([
        as_text(repo.getUrl()),
        as_text(repo.getName()),
        as_text(repo.getOwner()),
        str(repo.getStargazersCount()),
        str(repo.getWatchersCount()),
        as_text(developer_login),
        or_blank(developer_name, as_text),
        str(developer_followers),
        str(developer_following),
        str(developer_collaborators),
        or_blank(company, as_text),
        str(developer_contributions),
        str(created_at),
        or_blank(hireable, str),
        str(total_his_repositories),
        str(total_his_stars),
        str(total_his_collaborators),
        str(total_his_contributors),
        str(total_his_watchers),
        str(total_his_forks),
        str(total_his_has_issues),
        str(total_his_has_wiki),
        str(total_his_open_issues),
        str(total_network_count),
        or_blank(developer_location, as_text),
        str(developer_total_private_repos),
        str(developer_total_public_repos),
        str(total_his_issues),
        str(total_his_pull_requests),
    ])
    scream.log_debug('Wrote row to CSV.', True)
def analyze_with_splinter(self, repository):
    """Scrape repository counters from the GitHub web page using Splinter.

    Visits ``repository.html_url`` with ``self.splinter__browser`` and reads
    the numbers shown in the ``numbers-summary`` and ``sunken-menu-group``
    lists.  How the page source is obtained depends on the module-level
    ``splinter__driver`` ('firefox', 'chrome', 'phantomjs' or
    'zope.testbrowser').  Any exception raised while loading or parsing is
    logged and the whole attempt is repeated after a pause; when the
    module-level ``force_raise`` flag is set the exception is re-raised.

    Returns a dict that always has a ``'status'`` key: ``'404'`` (the
    "parallax_illustration" element was found), ``'EMPTY'`` (a blankslate
    element was found) or ``'OK'``.  With ``'OK'`` the dict also holds
    ``'commits'``, ``'branches'``, ``'releases'``, ``'contributors'``,
    ``'issues'`` and ``'pulls'`` as returned by ``parse_number``
    (``'issues'`` / ``'pulls'`` stay 0 when no matching menu entry exists).
    """
    # Validate first, so a missing repository fails on the assertion and not
    # on the attribute access used for logging.
    assert repository is not None
    url = repository.html_url
    assert url is not None
    scream.say('Starting webinterpret for ' + url + '..')

    result = dict()
    while True:
        try:
            # Both calls are best-effort: a failure is only logged (with an
            # empty message) and the visit goes ahead regardless.
            try:
                self.splinter__browser.set_page_load_timeout(15)
            except Exception:
                scream.say('')
            try:
                self.splinter__browser.ensure_success_response()
            except Exception:
                scream.say('')

            self.splinter__browser.visit(url)
            scream.say('Data from web retrieved')

            if splinter__driver == 'firefox':
                doc = html.document_fromstring(
                    unicode(self.splinter__browser.page_source))
            elif splinter__driver in ('chrome', 'phantomjs'):
                doc = html.document_fromstring(
                    unicode(self.splinter__browser.html))
            elif splinter__driver == 'zope.testbrowser':
                doc = html.document_fromstring(
                    unicode(self.splinter__browser.html.decode('utf-8')))
            else:
                assert False  # rest of browser not yet supported..

            scream.log_debug(str(url), True)
            scream.say('Continue to work on ' + url)
            scream.say('Page source sent further')

            scream.say('Verify if 404 (repo deleted) otherwise keep on going')
            parallax = doc.xpath('//div[@id="parallax_illustration"]')
            if len(parallax) > 0:
                scream.say('Verified that 404 (repo deleted)')
                result['status'] = '404'
                break
            scream.say('Verified that not 404')

            scream.say('Verify if repo empty otherwise keep on going')
            repo_empty = doc.xpath('//div[@class="blankslate has-fixed-width"]')
            if len(repo_empty) > 0:
                scream.say('Verified that repo is empty')
                result['status'] = 'EMPTY'
                break
            scream.say('Verified that repo not empty')

            if splinter__driver == 'phantomjs':
                # Poll until the "Fetching contributors" placeholder is gone,
                # then re-read the page so the parsed document is current.
                while True:
                    scream.say("Wait for the AJAX to do the magic")
                    if self.splinter__browser.is_element_not_present_by_xpath('//span[@class="octicon octicon-organization"]//..//..//text()[normalize-space(.)="Fetching contributors"]', wait_time=5):
                        break
                    else:
                        scream.say("AJAX didnt work on time")
                doc = html.document_fromstring(
                    unicode(self.splinter__browser.html))
                # NOTE(review): ``doc`` is a parsed element, not a string, so
                # this membership test presumably never inspects the page
                # text - confirm whether the raw HTML was meant here.
                assert "Fetching contributors" not in doc

            ns = doc.xpath('//ul[@class="numbers-summary"]')
            sunken = doc.xpath('//ul[@class="sunken-menu-group"]')
            scream.say('XPath made some search for ' + url + ' .. move on to bsoup..')
            scream.say('Xpath done searching')
            scream.say('Element found?: ' + str(len(ns) == 1))

            # An IndexError here (lists not found) is handled like any other
            # failure: the page is fetched again.
            local_soup = BeautifulSoup(etree.tostring(ns[0]))
            local_soup_sunken = BeautifulSoup(etree.tostring(sunken[0]))
            enumarables = local_soup.findAll("li")
            enumarables_more = local_soup_sunken.findAll("li")

            # The summary list is read positionally: commits, branches,
            # releases, contributors.
            summary_keys = ('commits', 'branches', 'releases', 'contributors')
            for position, key in enumerate(summary_keys):
                scream.say('enumarables[' + str(position) + ']')
                raw_number = analyze_tag(
                    enumarables[position].find("span", {"class": "num"}))
                if key == 'commits':
                    scream.say('analyze_tag finished execution for commits_number')
                scream.say('Before parse number: ' + str(raw_number))
                result[key] = parse_number(raw_number)
                scream.log_debug(result[key], True)

            result['issues'] = 0
            result['pulls'] = 0
            for menu_item in enumarables_more:
                # .get() rather than []: an <li> without an aria-label used to
                # raise KeyError, which the generic handler below turned into
                # an endless retry of the same page.
                label = menu_item.get("aria-label")
                if label == "Pull Requests":
                    pulls_number = analyze_tag(
                        menu_item.find("span", {"class": "counter"}))
                    scream.say('Before parse number: ' + str(pulls_number))
                    result['pulls'] = parse_number(pulls_number)
                elif label == "Issues":
                    issues_number = analyze_tag(
                        menu_item.find("span", {"class": "counter"}))
                    scream.say('Before parse number: ' + str(issues_number))
                    result['issues'] = parse_number(issues_number)

            result['status'] = 'OK'
            break
        except TypeError as ot:
            scream.say(str(ot))
            scream.say('Scrambled results (TypeError). Maybe GitHub down. Retry')
            time.sleep(5.0)
            if force_raise:
                raise
        except Exception as e:
            scream.say(str(e))
            scream.say('No response from selenium. Retry')
            time.sleep(2.0)
            if force_raise:
                raise

    assert 'status' in result
    return result
def developer_revealed(thread_getter_instance, repository, repo, contributor):
    """Gather statistics about one contributor and write them as a CSV row.

    Reads profile attributes from ``contributor``, walks over the
    repositories returned by ``contributor.get_repos()`` summing their
    counters, and finally writes a single row through the module-level
    ``result_writer``.

    The module-level ``count___`` selects how per-repository team figures
    are obtained:

    * ``'api'``      - count ``get_contributors()`` / ``get_collaborators()``;
    * ``'selenium'`` - scrape the repository page, through
      ``analyze_with_splinter`` when the module-level ``use_splinter`` is
      truthy and ``analyze_with_selenium`` otherwise;
    * anything else  - count distinct author logins from
      ``get_stats_contributors()``.

    Failures are reported through ``freeze`` and retried; when the
    module-level ``force_raise`` flag is set they are re-raised instead.

    :param thread_getter_instance: object providing the ``analyze_with_*``
        methods and ``threadId``.
    :param repository: API repository object; only ``name`` is read (logging).
    :param repo: local repository record providing the ``get*`` accessors
        used for the first CSV columns.
    :param contributor: API user object being described.
    """
    global result_writer
    global use_splinter

    developer_login = contributor.login
    scream.log_debug('Assigning a contributor: ' + str(developer_login) +
                     ' to a repo: ' + str(repository.name), True)
    developer_name = contributor.name
    # 1. Follower count of the developer [FollowEvent]
    developer_followers = contributor.followers
    # 2. Number of people the developer follows [FollowEvent]
    developer_following = contributor.following
    developer_location = contributor.location
    developer_total_private_repos = contributor.total_private_repos
    developer_total_public_repos = contributor.public_repos
    # 5. Number of repos he did not create in which he is a team member
    #    [TeamAddEvent] [MemberEvent]
    developer_collaborators = contributor.collaborators
    # 6. Number of repos he did not create in which he is a contributor
    #    [PushEvent] [IssuesEvent] [PullRequestEvent] [GollumEvent]
    developer_contributions = contributor.contributions
    # Projects created by him
    his_repositories = contributor.get_repos()

    while True:
        # Every pass over the repositories starts from clean totals, so a
        # retry after a failure never double-counts.
        total_his_repositories = 0
        total_his_stars = 0
        total_his_watchers = 0
        total_his_forks = 0
        total_his_has_issues = 0
        total_his_has_wiki = 0
        total_his_open_issues = 0
        total_network_count = 0
        total_his_collaborators = 0
        total_his_contributors = 0
        # Initialised in every mode: issues / pull requests are always written
        # to the CSV row, and used to raise NameError outside 'selenium' mode.
        # Commits / branches / releases are summed but not written to the row.
        total_his_commits = 0
        total_his_branches = 0
        total_his_releases = 0
        total_his_issues = 0
        total_his_pull_requests = 0

        try:
            for his_repo in his_repositories:
                try:
                    total_his_repositories += 1
                    total_his_forks += his_repo.forks_count
                    total_his_stars += his_repo.stargazers_count
                    total_his_watchers += his_repo.watchers_count
                    total_his_has_issues += 1 if his_repo.has_issues else 0
                    total_his_has_wiki += 1 if his_repo.has_wiki else 0
                    total_his_open_issues += his_repo.open_issues
                    total_network_count += his_repo.network_count

                    if count___ == 'api':
                        # 3. Number of developers in the projects he created
                        #    [PushEvent] [IssuesEvent] [PullRequestEvent]
                        #    [GollumEvent]
                        repo_contributors = None
                        while True:
                            try:
                                # get_contributors().totalCount is buggy and
                                # will make errors, hence the manual count.
                                repo_contributors = sum(
                                    1 for _ in his_repo.get_contributors())
                                break
                            except Exception:
                                freeze('Exception in getting total_his_contributors')
                                if force_raise:
                                    raise
                        assert repo_contributors is not None
                        # Accumulate across repositories; the total used to be
                        # reset here, so only the last repository was reported.
                        total_his_contributors += repo_contributors

                        # 4. Number of collaborators in the projects he created
                        repo_collaborators = None
                        while True:
                            try:
                                # get_collaborators().totalCount is buggy and
                                # will make errors, hence the manual count.
                                repo_collaborators = sum(
                                    1 for _ in his_repo.get_collaborators())
                                break
                            except Exception:
                                freeze('Exception in getting total_his_collaborators')
                                if force_raise:
                                    raise
                        assert repo_collaborators is not None
                        total_his_collaborators += repo_collaborators
                    elif count___ == 'selenium':
                        scream.say('Using selenium for thread about ' +
                                   str(developer_login) + ' \'s repositories')
                        # Extract the statistics from the web page; returned
                        # keys: commits, branches, releases, contributors,
                        # issues, pulls (plus status).
                        if use_splinter:
                            result = thread_getter_instance.analyze_with_splinter(his_repo)
                        else:
                            result = thread_getter_instance.analyze_with_selenium(his_repo)
                        if result['status'] == '404':
                            continue
                        if result['status'] == 'EMPTY':
                            continue
                        total_his_commits += result['commits']
                        total_his_branches += result['branches']
                        total_his_releases += result['releases']
                        total_his_issues += result['issues']
                        total_his_pull_requests += result['pulls']
                        total_his_contributors += result['contributors']
                    else:
                        # Hence it is only when selenium is not used.
                        while True:
                            try:
                                his_contributors = set()
                                stats = his_repo.get_stats_contributors()
                                assert stats is not None
                                for stat in stats:
                                    if str(stat.author.login).strip() in ['None', '']:
                                        continue
                                    his_contributors.add(stat.author.login)
                                total_his_contributors += len(his_contributors)
                                break
                            except Exception as exc:
                                scream.log_warning(
                                    'Not ready data while revealing details.. ' +
                                    ', error({0})'.format(str(exc)), True)
                                freeze('StatsContribution not ready.. waiting for the server to provide good data')
                                if force_raise:
                                    raise
                except GithubException as e:
                    freeze(str(e) + ' in try per repo of x-dev repos')
                    if ("message" in e.data) and (e.data["message"].strip() == "Repository access blocked"):
                        scream.log_debug("It is a private repo.. Skip!")
                        continue
                    if force_raise:
                        raise
            break
        except Exception as e:
            freeze(str(e) + ' in main loop of developer_revealed()')
            # Fetch a fresh repository listing before the next attempt.
            his_repositories = contributor.get_repos()
            if force_raise:
                raise

    # Developer company (if any given)
    company = contributor.company
    created_at = contributor.created_at
    # Does the developer want to be hired?
    hireable = contributor.hireable

    scream.log_debug('Thread ' + str(thread_getter_instance.threadId) +
                     ' Finished revealing contributor: ' + str(developer_login) +
                     ' in a repo: ' + str(repository.name), True)

    if show_trace:
        scream.log_debug('Printing traceback stack', True)
        traceback.print_stack()
        scream.log_debug('Printing traceback exc pathway', True)
        traceback.print_exc()

    # With use_utf8 the textual fields are handed to the writer untouched,
    # otherwise they are passed through str() first.
    if use_utf8:
        def as_text(value):
            return value
    else:
        as_text = str

    def or_blank(value, convert):
        """Return convert(value), or '' when value is None."""
        return convert(value) if value is not None else ''

    result_writer.writerow([
        as_text(repo.getUrl()),
        as_text(repo.getName()),
        as_text(repo.getOwner()),
        str(repo.getStargazersCount()),
        str(repo.getWatchersCount()),
        as_text(developer_login),
        or_blank(developer_name, as_text),
        str(developer_followers),
        str(developer_following),
        str(developer_collaborators),
        or_blank(company, as_text),
        str(developer_contributions),
        str(created_at),
        or_blank(hireable, str),
        str(total_his_repositories),
        str(total_his_stars),
        str(total_his_collaborators),
        str(total_his_contributors),
        str(total_his_watchers),
        str(total_his_forks),
        str(total_his_has_issues),
        str(total_his_has_wiki),
        str(total_his_open_issues),
        str(total_network_count),
        or_blank(developer_location, as_text),
        str(developer_total_private_repos),
        str(developer_total_public_repos),
        str(total_his_issues),
        str(total_his_pull_requests),
    ])
    scream.log_debug('Wrote row to CSV.', True)
def developer_revealed(thread_getter_instance, repository, repo, contributor): global result_writer global result_punch_card_writer assert result_punch_card_writer is not None developer_login = contributor.login scream.log_debug('Assigning a contributor: ' + str(developer_login) + ' to a repo: ' + str(repository.name), True) developer_name = contributor.name # 1. Ilosc osob, ktore dany deweloper followuje [FollowEvent] developer_followers = contributor.followers # 2. Ilosc osob, ktore followuja dewelopera [FollowEvent] developer_following = contributor.following developer_location = contributor.location developer_total_private_repos = contributor.total_private_repos developer_total_public_repos = contributor.public_repos # 5. Ilosc repo, ktorych nie tworzyl, w ktorych jest team member [TeamAddEvent] [MemberEvent] developer_collaborators = contributor.collaborators # 6. Ilosc repo, ktorych nie tworzyl, w ktorych jest contributorem [PushEvent] [IssuesEvent] [PullRequestEvent] [GollumEvent] developer_contributions = contributor.contributions # his_repositories - Ilosc projektow przez niego utworzonych / ktorych jest wlascicielem # his_repositories = contributor.get_repos() # 17. Czy commituje w godzinach pracy (zaleznie od strefy czasowej)? 
scream.log_debug("Starting to analyze OSRC card for user: "******"Histogram for hours for user: " + str(developer_login) + ' created..', True) # count activity during business day count_bd__ = 0 count_bd__ += sum(time_of_activity_per_hours[i] for i in range(9, 18)) # now count activity during not-busines hours :) count_nwh__ = 0 count_nwh__ += sum(time_of_activity_per_hours[i] for i in range(0, 9)) count_nwh__ += sum(time_of_activity_per_hours[i] for i in range(18, 24)) developer_works_during_bd = True if count_bd__ >= count_nwh__ else False scream.log_debug('Running C program...', True) args___ = ['./hist_block'] + [str(x) for x in time_of_activity_per_hours] developer_works_period = subprocess.Popen(args___, stdout=subprocess.PIPE).stdout.read() # ----------------------------------------------------------------------- scream.log_debug('Finished analyze OSRC card for user: '******'OSRC gave error, probably 404') scream.say('try ' + str(tries) + ' more times') tries -= 1 finally: if tries < 1: developer_works_during_bd = 0 developer_works_period = 0 break # Developer company (if any given) company = contributor.company created_at = contributor.created_at # Does the developer want to be hired? 
hireable = contributor.hireable disk_usage = contributor.disk_usage public_gists = contributor.public_gists owned_private_repos = contributor.owned_private_repos total_private_repos = contributor.total_private_repos scream.log_debug('Thread ' + str(thread_getter_instance) + ' Finished revealing contributor: ' + str(developer_login) + ' in a repo: ' + str(repository.name), True) if show_trace: scream.log_debug('Printing traceback stack', True) traceback.print_stack() scream.log_debug('Printing traceback exc pathway', True) traceback.print_exc() if not use_utf8: result_writer.writerow([str(repo.getUrl()), str(repo.getName()), str(repo.getOwner()), str(repo.getStargazersCount()), str(repo.getWatchersCount()), str(repo.getCreatedAt()), str(repo.getDefaultBranch()), str(repo.getDescription()), str(repo.getIsFork()), str(repo.getForks()), str(repo.getForksCount()), str(repo.getHasDownloads()), str(repo.getHasWiki()), str(repo.getHasIssues()), str(repo.getLanguage()), str(repo.getMasterBranch()), str(repo.getNetworkCount()), str(repo.getOpenedIssues()), str(repo.getOrganization()), str(repo.getPushedAt()), str(repo.getUpdatedAt()), str(developer_login), str(developer_name if developer_name is not None else ''), str(developer_followers), str(developer_following), str(developer_collaborators), str(company if company is not None else ''), str(developer_contributions), str(created_at), str(hireable if hireable is not None else ''), str(developer_location if developer_location is not None else ''), str(developer_total_private_repos), str(developer_total_public_repos), str(developer_works_during_bd), str(developer_works_period), str(disk_usage), str(public_gists), str(owned_private_repos), str(total_private_repos)]) else: result_writer.writerow([repo.getUrl(), repo.getName(), repo.getOwner(), str(repo.getStargazersCount()), str(repo.getWatchersCount()), str(repo.getCreatedAt()), repo.getDefaultBranch(), repo.getDescription() if repo.getDescription() is not None else '', 
str(repo.getIsFork()), str(repo.getForks()), str(repo.getForksCount()), str(repo.getHasDownloads()), str(repo.getHasWiki()), str(repo.getHasIssues()), repo.getLanguage() if repo.getLanguage() is not None else '', repo.getMasterBranch() if repo.getMasterBranch() is not None else '', str(repo.getNetworkCount()), str(repo.getOpenedIssues()), repo.getOrganization() if repo.getOrganization() is not None else '', str(repo.getPushedAt()), str(repo.getUpdatedAt()), developer_login, developer_name if developer_name is not None else '', str(developer_followers), str(developer_following), str(developer_collaborators), company if company is not None else '', str(developer_contributions), str(created_at), str(hireable) if hireable is not None else '', developer_location if developer_location is not None else '', str(developer_total_private_repos), str(developer_total_public_repos), str(developer_works_during_bd), str(developer_works_period), str(disk_usage), str(public_gists), str(owned_private_repos), str(total_private_repos)]) scream.log_debug('Wrote row to CSV.', True)
continue else: resume_on_repo = None iteration_step_count += 1 if resume_on_repo_inclusive: scream.say('Not skipping the ' + str(resume_on_repo_name)) else: scream.say('Starting from the next from ' + str(resume_on_repo_name)) continue try: while True: if show_trace: scream.log_debug('Printing traceback stack', True) traceback.print_stack() scream.log_debug('Printing traceback exc pathway', True) traceback.print_exc() #scream.log_warning(inspect.getargvalues(sys.exc_info()[2].tb_frame)) scream.say( 'Creating Repository.py instance from API result..') scream.say('Working at the moment on repo: ' + str(repo.getKey())) current_ghc = github_clients[num_modulo(thread_id_count)] current_ghc_desc = github_clients_ids[num_modulo( thread_id_count)] repository = current_ghc.get_repo(repo.getKey()) scream.log_debug('Got a repository from API', True) repo.setRepoObject(repository)
with open('results.csv', 'wb') as csv_file: csv_writer = csv.writer(csv_file, delimiter=';', quotechar='\"', quoting=csv.QUOTE_ALL) with open('github-users-stats.json') as data_file: data = json.load(data_file) while(counter < limit): developer_login = data[counter]['login'] print "Starting to analyze OSRC card for user: " + str(developer_login) scream.progress_bar(counter, limit-1) tries = 5 while True: try: osrc_url = 'https://osrc.dfm.io/' + str(developer_login) + '.json' scream.log_debug('The osrc url is: ' + osrc_url, True) # OSRC was grumpy about the urllib2 even with headers attached # hdr = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7', # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', # 'Accept-Encoding': 'none', # 'Accept-Language': 'en-US,en;q=0.8', # 'Connection': 'keep-alive'} # req = urllib2.Request(osrc_url, headers=hdr) # response = urllib2.urlopen(req) # thus i moved to requests library proxy = {'http': '94.154.26.132:8090'} session_osrc = requests.Session() requests_osrc = session_osrc.get(osrc_url, proxies=proxy) # print requests_osrc.text osrc_data = json.loads(requests_osrc.text)
# developer_revealed(thread_getter_instance, repository, repo, contributor)
#
# Collects profile attributes of one contributor and writes a single CSV row
# (repository fields followed by developer fields) to the global
# result_writer.
#
# What the visible code does:
#   * asserts that the global result_punch_card_writer is not None;
#   * reads login, name, followers, following, location, total_private_repos,
#     public_repos, collaborators, contributions, company, created_at,
#     hireable, disk_usage, public_gists and owned_private_repos from
#     `contributor`;
#   * uses `thread_getter_instance` and `repository.name` only in log
#     messages;
#   * sets developer_works_during_bd to whether the summed
#     time_of_activity_per_hours for hours 9-17 is >= the sum for hours 0-8
#     plus hours 18-23;
#   * runs ./hist_block with the histogram values as arguments and keeps its
#     stdout as developer_works_period;
#   * after reporting 'OSRC gave error, probably 404' decrements `tries`, and
#     in the `finally` clause, once tries < 1, sets both work-hour values to 0
#     and breaks;
#   * chooses the row layout by the global use_utf8: when it is false every
#     field is wrapped in str(); otherwise text fields are passed through
#     unchanged, with '' substituted for None on the optional ones.
# No return statement is visible in this block.
#
# English glosses of the numbered Polish notes inside the body:
#   1.  number of people the given developer follows [FollowEvent]
#   2.  number of people who follow the developer [FollowEvent]
#   5.  number of repos he did not create in which he is a team member
#   6.  number of repos he did not create in which he is a contributor
#   his_repositories - number of projects he created / of which he is owner
#   17. does he commit during working hours (depending on time zone)?
#
# NOTE(review): notes 1 and 2 read as swapped relative to the attributes they
#   sit above (note 1 is placed on `contributor.followers`, note 2 on
#   `contributor.following`) - confirm against the PyGithub NamedUser docs.
# NOTE(review): parts of this function are masked with "******" in the text
#   under review (the code that builds time_of_activity_per_hours and the
#   header of the retry loop / its except clause), so they are not described
#   here.
# NOTE(review): this chunk contains a second definition named
#   developer_revealed further down; if both live in one module the later
#   binding replaces this one - confirm which is intended.
def developer_revealed(thread_getter_instance, repository, repo, contributor): global result_writer global result_punch_card_writer assert result_punch_card_writer is not None developer_login = contributor.login scream.log_debug( 'Assigning a contributor: ' + str(developer_login) + ' to a repo: ' + str(repository.name), True) developer_name = contributor.name # 1. Ilosc osob, ktore dany deweloper followuje [FollowEvent] developer_followers = contributor.followers # 2. Ilosc osob, ktore followuja dewelopera [FollowEvent] developer_following = contributor.following developer_location = contributor.location developer_total_private_repos = contributor.total_private_repos developer_total_public_repos = contributor.public_repos # 5. Ilosc repo, ktorych nie tworzyl, w ktorych jest team member [TeamAddEvent] [MemberEvent] developer_collaborators = contributor.collaborators # 6. Ilosc repo, ktorych nie tworzyl, w ktorych jest contributorem [PushEvent] [IssuesEvent] [PullRequestEvent] [GollumEvent] developer_contributions = contributor.contributions # his_repositories - Ilosc projektow przez niego utworzonych / ktorych jest wlascicielem # his_repositories = contributor.get_repos() # 17. Czy commituje w godzinach pracy (zaleznie od strefy czasowej)? 
scream.log_debug( "Starting to analyze OSRC card for user: "******"Histogram for hours for user: " + str(developer_login) + ' created..', True) # count activity during business day count_bd__ = 0 count_bd__ += sum(time_of_activity_per_hours[i] for i in range(9, 18)) # now count activity during not-busines hours :) count_nwh__ = 0 count_nwh__ += sum(time_of_activity_per_hours[i] for i in range(0, 9)) count_nwh__ += sum(time_of_activity_per_hours[i] for i in range(18, 24)) developer_works_during_bd = True if count_bd__ >= count_nwh__ else False scream.log_debug('Running C program...', True) args___ = ['./hist_block' ] + [str(x) for x in time_of_activity_per_hours] developer_works_period = subprocess.Popen( args___, stdout=subprocess.PIPE).stdout.read() # ----------------------------------------------------------------------- scream.log_debug( 'Finished analyze OSRC card for user: '******'OSRC gave error, probably 404') scream.say('try ' + str(tries) + ' more times') tries -= 1 finally: if tries < 1: developer_works_during_bd = 0 developer_works_period = 0 break # Developer company (if any given) company = contributor.company created_at = contributor.created_at # Does the developer want to be hired? 
hireable = contributor.hireable disk_usage = contributor.disk_usage public_gists = contributor.public_gists owned_private_repos = contributor.owned_private_repos total_private_repos = contributor.total_private_repos scream.log_debug( 'Thread ' + str(thread_getter_instance) + ' Finished revealing contributor: ' + str(developer_login) + ' in a repo: ' + str(repository.name), True) if show_trace: scream.log_debug('Printing traceback stack', True) traceback.print_stack() scream.log_debug('Printing traceback exc pathway', True) traceback.print_exc() if not use_utf8: result_writer.writerow([ str(repo.getUrl()), str(repo.getName()), str(repo.getOwner()), str(repo.getStargazersCount()), str(repo.getWatchersCount()), str(repo.getCreatedAt()), str(repo.getDefaultBranch()), str(repo.getDescription()), str(repo.getIsFork()), str(repo.getForks()), str(repo.getForksCount()), str(repo.getHasDownloads()), str(repo.getHasWiki()), str(repo.getHasIssues()), str(repo.getLanguage()), str(repo.getMasterBranch()), str(repo.getNetworkCount()), str(repo.getOpenedIssues()), str(repo.getOrganization()), str(repo.getPushedAt()), str(repo.getUpdatedAt()), str(developer_login), str(developer_name if developer_name is not None else ''), str(developer_followers), str(developer_following), str(developer_collaborators), str(company if company is not None else ''), str(developer_contributions), str(created_at), str(hireable if hireable is not None else ''), str(developer_location if developer_location is not None else ''), str(developer_total_private_repos), str(developer_total_public_repos), str(developer_works_during_bd), str(developer_works_period), str(disk_usage), str(public_gists), str(owned_private_repos), str(total_private_repos) ]) else: result_writer.writerow([ repo.getUrl(), repo.getName(), repo.getOwner(), str(repo.getStargazersCount()), str(repo.getWatchersCount()), str(repo.getCreatedAt()), repo.getDefaultBranch(), repo.getDescription() if repo.getDescription() is not None else '', 
str(repo.getIsFork()), str(repo.getForks()), str(repo.getForksCount()), str(repo.getHasDownloads()), str(repo.getHasWiki()), str(repo.getHasIssues()), repo.getLanguage() if repo.getLanguage() is not None else '', repo.getMasterBranch() if repo.getMasterBranch() is not None else '', str(repo.getNetworkCount()), str(repo.getOpenedIssues()), repo.getOrganization() if repo.getOrganization() is not None else '', str(repo.getPushedAt()), str(repo.getUpdatedAt()), developer_login, developer_name if developer_name is not None else '', str(developer_followers), str(developer_following), str(developer_collaborators), company if company is not None else '', str(developer_contributions), str(created_at), str(hireable) if hireable is not None else '', developer_location if developer_location is not None else '', str(developer_total_private_repos), str(developer_total_public_repos), str(developer_works_during_bd), str(developer_works_period), str(disk_usage), str(public_gists), str(owned_private_repos), str(total_private_repos) ]) scream.log_debug('Wrote row to CSV.', True)
# developer_revealed(thread_getter_instance, repository, repo, contributor)
#
# Collects profile attributes of one contributor, walks the repositories
# returned by contributor.get_repos() to gather contributor / collaborator
# statistics, and writes a single CSV row (repository fields, aggregate
# counters, then developer fields) to the global result_writer.
#
# What the visible code does:
#   * asserts that the global result_punch_card_writer is not None;
#   * reads login, name, followers, following, location, total_private_repos,
#     public_repos, collaborators, contributions, company, created_at,
#     hireable, disk_usage, public_gists and owned_private_repos from
#     `contributor`;
#   * uses `thread_getter_instance` and `repository.name` only in log
#     messages;
#   * for a repository `his_repo`: calls his_repo.get_stats_contributors(),
#     asks get_status_code() for the matching /stats/contributors URL and,
#     unless the status is 204, sums the weekly c / a / d values of each
#     stats entry, records the author's login in his_contributors and writes
#     a row (repo owner, repo name, developer login, author login, total,
#     c, a, d) to result_punch_card_writer; on 204 it only logs that the
#     sub-repository is empty;
#   * on GithubException calls freeze() and leaves the stats loop when the
#     message is "Repository access blocked"; on TypeError / Exception calls
#     freeze(); every handler re-raises when the global force_raise is set;
#   * increments self_contributing when developer_login is in
#     his_contributors, counts every collaborator of his_repo in
#     total_his_collaborators and increments self_collaborating when the
#     collaborator login equals developer_login;
#   * derives developer_foreign_collaborators (developer_collaborators, or 0
#     when it is None, minus self_collaborating) and
#     developer_foreign_contributions (developer_contributions minus
#     self_contributing);
#   * on an outer Exception calls freeze() and fetches
#     contributor.get_repos() again before retrying;
#   * chooses the row layout by the global use_utf8: when it is false every
#     field is wrapped in str(); otherwise text fields are passed through
#     unchanged, with '' substituted for None on the optional ones.
# No return statement is visible in this block.
#
# English glosses of the Polish notes inside the body:
#   1.   number of people the given developer follows [FollowEvent]
#   2.   number of people who follow the developer [FollowEvent]
#   5a.  number of repos in which he is a team member
#   6a.  number of repos in which he is a contributor
#   his_repositories - number of projects he created / of which he is owner
#   17.  does he commit during working hours (depending on time zone)?
#   5.   number of repos he did not create in which he is a team member;
#        here it is enough to subtract from developer_collaborators the
#        occurrences of the login in the sub-repo's get_collaborators()
#   6.   number of repos he did not create in which he is a contributor /
#        developer; here it is enough to subtract from
#        developer_contributions the occurrences of the login among the
#        sub-repo's contributors
#
# NOTE(review): notes 1 and 2 read as swapped relative to the attributes they
#   sit above (note 1 is placed on `contributor.followers`, note 2 on
#   `contributor.following`) - confirm against the PyGithub NamedUser docs.
# NOTE(review): developer_foreign_contributions has no None guard, unlike
#   developer_foreign_collaborators - confirm contributions is never None.
# NOTE(review): parts of this function are masked with "******" in the text
#   under review (the work-hours analysis, the initialisation of the
#   counters and of his_contributors, and the header of the loop over
#   his_repositories), so they are not described here; the exact nesting of
#   the punch-card writerow relative to the membership check cannot be
#   recovered from the collapsed text either.
# NOTE(review): this chunk contains an earlier definition with the same name;
#   if both live in one module this later binding is the one callers get.
def developer_revealed(thread_getter_instance, repository, repo, contributor): global result_writer global result_punch_card_writer assert result_punch_card_writer is not None developer_login = contributor.login scream.log_debug('Assigning a contributor: ' + str(developer_login) + ' to a repo: ' + str(repository.name), True) developer_name = contributor.name # 1. Ilosc osob, ktore dany deweloper followuje [FollowEvent] developer_followers = contributor.followers # 2. Ilosc osob, ktore followuja dewelopera [FollowEvent] developer_following = contributor.following developer_location = contributor.location developer_total_private_repos = contributor.total_private_repos developer_total_public_repos = contributor.public_repos # 5a. Ilosc repo, w ktorych jest team member [TeamAddEvent] [MemberEvent] developer_collaborators = contributor.collaborators scream.say('Developer collaborators count is: ' + str(developer_collaborators)) # 6a. Ilosc repo, w ktorych jest contributorem [PushEvent] [IssuesEvent] [PullRequestEvent] [GollumEvent] developer_contributions = contributor.contributions scream.say('Developer contributions count is: ' + str(developer_contributions)) # his_repositories - Ilosc projektow przez niego utworzonych / ktorych jest wlascicielem his_repositories = contributor.get_repos() # 17. Czy commituje w godzinach pracy (zaleznie od strefy czasowej)? 
scream.log_debug("Starting to analyze OSRC card for user: "******"Histogram for hours for user: "******"pod repozytorium" while True: try: trying_to_get_stats += 1 stats = his_repo.get_stats_contributors() status_code__ = get_status_code('https://api.github.com/repos/' + his_repo.full_name + '/stats/contributors') if status_code__ != 204: for s in stats: ad___c = 0 ad___a = 0 ad___d = 0 for w in s.weeks: ad___c += w.c ad___a += w.a ad___d += w.d if s.author.login not in his_contributors: his_contributors.add(s.author.login) result_punch_card_writer.writerow([str(his_repo.owner.login), str(his_repo.name), str(developer_login), str(s.author.login), str(s.total), str(ad___c), str(ad___a), str(ad___d)]) else: scream.log_debug('The subrepo is empty, thus no statistics (punchcard) generated this time', True) break except GithubException as e: freeze(str(e) + ' his_repo.get_stats_contributors(). Unexpected error with getting stats.') if ("message" in e.data) and (e.data["message"].strip() == "Repository access blocked"): scream.log_debug("It is a private repo.. Skip!", True) break if force_raise: raise except TypeError as e: scream.log_warning('This was stats attempt no: ' + str(trying_to_get_stats), True) freeze(str(e) + ' his_repo.get_stats_contributors(). Punch-card not ready?') # probably punch card not ready if force_raise: raise except Exception as e: freeze(str(e) + ' his_repo.get_stats_contributors(). Punch-card not ready?') # probably punch card not ready if force_raise: raise # 6. Ilosc repo, ktorych nie tworzyl, w ktorych jest deweloperem if developer_login in his_contributors: self_contributing += 1 # 5. 
Ilosc repo, ktorych nie tworzyl, w ktorych jest team member subrepo_collaborators = his_repo.get_collaborators() for collaborator in subrepo_collaborators: total_his_collaborators += 1 if developer_login == collaborator.login: self_collaborating += 1 # All elements paginated through his_repositories, thus we can safely break loop break except GithubException as e: freeze('While getting subrepo details, ' + str(e) + ' in element his_repo in his_repositories') if ("message" in e.data) and (e.data["message"].strip() == "Repository access blocked"): scream.log_debug("It is a private repo.. Skip!") continue if force_raise: raise except TypeError as e: freeze('While getting subrepo details, ' + str(e) + ' in element his_repo in his_repositories. Quota depleted?') # probably punch card not ready if force_raise: raise except Exception as e: freeze('While getting subrepo details, ' + str(e) + ' in element his_repo in his_repositories. Quota depleted?') # probably punch card not ready if force_raise: raise total_his_contributors = len(his_contributors) # 5. Ilosc repo, ktorych nie tworzyl, w ktorych jest team member [TeamAddEvent] [MemberEvent] # tutaj od wartosci developer_collaborators wystarczy odjac wystapienia loginu w podrepo.get_collaborators() developer_foreign_collaborators = (developer_collaborators if developer_collaborators is not None else 0) - self_collaborating # 6. Ilosc repo, ktorych nie tworzyl, w ktorych jest contributorem [PushEvent] [IssuesEvent] [PullRequestEvent] [GollumEvent] # tutaj od wartosci developer_contributions wystarczy odjac wystapienia loginu w podrepo.get_contributions() developer_foreign_contributions = developer_contributions - self_contributing # All properties checked for a dev, thus we can safely break loop break except Exception as e: freeze('Error ' + str(e) + ' in for his_repo in his_repositories loop. 
Will start the subrepo analysis from the beginning.') his_repositories = contributor.get_repos() if force_raise: raise # Developer company (if any given) company = contributor.company created_at = contributor.created_at # Does the developer want to be hired? hireable = contributor.hireable disk_usage = contributor.disk_usage public_gists = contributor.public_gists owned_private_repos = contributor.owned_private_repos total_private_repos = contributor.total_private_repos scream.log_debug('Thread ' + str(thread_getter_instance) + ' Finished revealing contributor: ' + str(developer_login) + ' in a repo: ' + str(repository.name), True) if show_trace: scream.log_debug('Printing traceback stack', True) traceback.print_stack() scream.log_debug('Printing traceback exc pathway', True) traceback.print_exc() if not use_utf8: result_writer.writerow([str(repo.getUrl()), str(repo.getName()), str(repo.getOwner()), str(repo.getStargazersCount()), str(repo.getWatchersCount()), str(repo.getCreatedAt()), str(repo.getDefaultBranch()), str(repo.getDescription()), str(repo.getIsFork()), str(repo.getForks()), str(repo.getForksCount()), str(repo.getHasDownloads()), str(repo.getHasWiki()), str(repo.getHasIssues()), str(repo.getLanguage()), str(repo.getMasterBranch()), str(repo.getNetworkCount()), str(repo.getOpenedIssues()), str(repo.getOrganization()), str(repo.getPushedAt()), str(repo.getUpdatedAt()), str(repo.getPullsCount()), str(total_his_contributors), str(total_his_collaborators), str(developer_foreign_collaborators), str(developer_foreign_contributions), str(total_his_issues), str(total_his_pull_requests), str(developer_login), str(developer_name if developer_name is not None else ''), str(developer_followers), str(developer_following), str(developer_collaborators), str(company if company is not None else ''), str(developer_contributions), str(created_at), str(hireable if hireable is not None else ''), str(developer_location if developer_location is not None else ''), 
str(developer_total_private_repos), str(developer_total_public_repos), str(developer_works_during_bd), str(developer_works_period), str(disk_usage), str(public_gists), str(owned_private_repos), str(total_private_repos)]) else: result_writer.writerow([repo.getUrl(), repo.getName(), repo.getOwner(), str(repo.getStargazersCount()), str(repo.getWatchersCount()), str(repo.getCreatedAt()), repo.getDefaultBranch(), repo.getDescription() if repo.getDescription() is not None else '', str(repo.getIsFork()), str(repo.getForks()), str(repo.getForksCount()), str(repo.getHasDownloads()), str(repo.getHasWiki()), str(repo.getHasIssues()), repo.getLanguage() if repo.getLanguage() is not None else '', repo.getMasterBranch() if repo.getMasterBranch() is not None else '', str(repo.getNetworkCount()), str(repo.getOpenedIssues()), repo.getOrganization() if repo.getOrganization() is not None else '', str(repo.getPushedAt()), str(repo.getUpdatedAt()), str(repo.getPullsCount()), str(total_his_contributors), str(total_his_collaborators), str(developer_foreign_collaborators), str(developer_foreign_contributions), str(total_his_issues), str(total_his_pull_requests), developer_login, developer_name if developer_name is not None else '', str(developer_followers), str(developer_following), str(developer_collaborators), company if company is not None else '', str(developer_contributions), str(created_at), str(hireable) if hireable is not None else '', developer_location if developer_location is not None else '', str(developer_total_private_repos), str(developer_total_public_repos), str(developer_works_during_bd), str(developer_works_period), str(disk_usage), str(public_gists), str(owned_private_repos), str(total_private_repos)]) scream.log_debug('Wrote row to CSV.', True)