def run(self):
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        try:
            directory = base_path + "/" + login
            files = os.listdir(directory)
            for file in files:
                file_path = directory + "/" + file
                text = base.get_info_from_file(file_path)
                if text is False:
                    logging.warn("file not existed: " + file_path)
                else:
                    obj = json.loads(text)
                    logging.info("read file: " + file_path)
                    count = 1
                    for edge in obj["data"]["user"]["sponsorshipsAsMaintainer"]["edges"]:
                        if edge["node"]["privacyLevel"] == "PRIVATE":
                            cur.execute(
                                "insert into github_sponsorships_as_maintainer "
                                "(login, flag, created_at) "
                                "values (%s, %s, %s)",
                                (obj["data"]["user"]["login"], base.flag2,
                                 base.time_handler(edge["node"]["createdAt"])))
                        else:
                            if "company" in edge["node"]["sponsorEntity"]:
                                flag = base.flag0
                            else:
                                flag = base.flag1
                            cur.execute(
                                "insert into github_sponsorships_as_maintainer "
                                "(login, sponsor_login, flag, created_at) "
                                "values (%s, %s, %s, %s)",
                                (obj["data"]["user"]["login"],
                                 edge["node"]["sponsorEntity"]["login"], flag,
                                 base.time_handler(edge["node"]["createdAt"])))
                        db.commit()
                        logging.info("the " + str(count) + "th record in file: " + file_path)
                        count += 1
            self.q.task_done()
            cur.close()
            db.close()
        except Exception as e:
            logging.fatal(e)
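# Note on base.time_handler: the real helper lives in the author's base module and is not
# shown in this section. The sketch below is only an assumption of what it presumably does:
# convert GitHub's ISO-8601 UTC timestamps (e.g. "2020-01-01T12:34:56Z") into MySQL-friendly
# "YYYY-MM-DD HH:MM:SS" strings for the DATETIME columns used by the inserts above.
# The function name and the timestamp format are assumptions for illustration only.
from datetime import datetime

def time_handler(timestamp):
    # parse the GraphQL/REST "createdAt"/"updatedAt" value and re-format it for MySQL
    return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M:%S")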
def run(self):
    while not self.q.empty():
        work = self.q.get()
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        file = base_path + "/" + login + ".json"
        text = base.get_info_from_file(file)
        if text is False:
            logging.warn("file not existed: " + file)
        else:
            obj = json.loads(text)
            logging.info("writing login data: " + login)
            if obj["data"]["user"]["hasSponsorsListing"] is True:
                has_sponsors_listing = "1"
            else:
                has_sponsors_listing = "0"
            cur.execute(
                "insert into github_user "
                "(database_id, login, name, email, spon_maintainer_count,"
                " spon_sponsor_count, created_at, updated_at, has_sponsors_listing) "
                "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (obj["data"]["user"]["databaseId"],
                 obj["data"]["user"]["login"],
                 obj["data"]["user"]["name"],
                 obj["data"]["user"]["email"],
                 obj["data"]["user"]["sponsorshipsAsMaintainer"]["totalCount"],
                 obj["data"]["user"]["sponsorshipsAsSponsor"]["totalCount"],
                 base.time_handler(obj["data"]["user"]["createdAt"]),
                 base.time_handler(obj["data"]["user"]["updatedAt"]),
                 has_sponsors_listing))
            db.commit()
            logging.info(login + " ~~~~~~~~~ data commit into database success!!")
        self.q.task_done()
        cur.close()
        db.close()
def run(self):
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        try:
            file = base_path + "/" + login + ".json"
            text = base.get_info_from_file(file)
            if text is False:
                logging.warn("file not existed: " + file)
            else:
                obj = json.loads(text)
                if obj["data"]["user"]["sponsorsListing"] is not None:
                    logging.info(login + " ~~~~~~~~~ has " +
                                 str(obj["data"]["user"]["sponsorsListing"]["tiers"]["totalCount"]) +
                                 " tiers")
                    count = 1
                    for edge in obj["data"]["user"]["sponsorsListing"]["tiers"]["edges"]:
                        cur.execute(
                            "insert into github_sponsor_listing_tiers "
                            "(login, slug, monthly_price_in_cents, monthly_price_in_dollars, name, created_at, updated_at, description) "
                            "values (%s, %s, %s, %s, %s, %s, %s, %s)",
                            (obj["data"]["user"]["login"],
                             obj["data"]["user"]["sponsorsListing"]["slug"],
                             edge["node"]["monthlyPriceInCents"],
                             edge["node"]["monthlyPriceInDollars"],
                             edge["node"]["name"],
                             base.time_handler(edge["node"]["createdAt"]),
                             base.time_handler(edge["node"]["updatedAt"]),
                             edge["node"]["description"]))
                        db.commit()
                        # logging.info("the " + str(count) + "th tier data commit into database success!!")
                        count += 1
                else:
                    logging.warn("login: " + login + " don't have sponsor_listing")
                    logging.warn("sponsor_listing: " + str(obj))
            self.q.task_done()
            cur.close()
            db.close()
        except Exception as e:
            logging.fatal(e)
def run(self):
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        try:
            directory = base_path + "/" + login
            files = base.read_all_filename_in_directory(directory)
            for file in files:
                text = base.get_info_from_file(file)
                if text is False:
                    logging.warn("file not existed: " + file)
                else:
                    obj = json.loads(text)
                    logging.info("read file: " + file)
                    count = 1
                    if "edges" not in obj["data"]["user"]["commitComments"]:
                        continue
                    for node in obj["data"]["user"]["commitComments"]["edges"]:
                        logging.info("the " + str(count) + "th record in file: " + file)
                        if node["node"]["commit"] is not None:
                            oid = node["node"]["commit"]["oid"]
                        else:
                            oid = ""
                        cur.execute(
                            "insert into github_commit_comment "
                            "(comm_database_id, login, created_at, updated_at, body, commit_oid) "
                            "values (%s, %s, %s, %s, %s, %s)",
                            (node["node"]["databaseId"],
                             obj["data"]["user"]["login"],
                             base.time_handler(node["node"]["createdAt"]),
                             base.time_handler(node["node"]["updatedAt"]),
                             node["node"]["body"], oid))
                        db.commit()
                        count += 1
            self.q.task_done()
            cur.close()
            db.close()
        except Exception as e:
            logging.fatal(e)
            return
def run(self):
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        try:
            file = base_path + "/" + login + ".json"
            text = base.get_info_from_file(file)
            if text is False:
                logging.warn("file not existed: " + file)
            else:
                obj = json.loads(text)
                if obj["data"]["user"]["sponsorsListing"] is None:
                    logging.info("user: " + login + " don't create sponsors")
                else:
                    cur.execute(
                        "SELECT * FROM github_sponsor_listing WHERE login=%s",
                        (login,))
                    items = cur.fetchall()
                    if len(items) == 1:
                        logging.info("user: " + login + " had been inserted into database!")
                    else:
                        cur.execute(
                            "insert into github_sponsor_listing "
                            "(login, slug, name, tiers_total_count, created_at, short_description) "
                            "values (%s, %s, %s, %s, %s, %s)",
                            (obj["data"]["user"]["login"],
                             obj["data"]["user"]["sponsorsListing"]["slug"],
                             obj["data"]["user"]["sponsorsListing"]["name"],
                             obj["data"]["user"]["sponsorsListing"]["tiers"]["totalCount"],
                             base.time_handler(obj["data"]["user"]["sponsorsListing"]["createdAt"]),
                             obj["data"]["user"]["sponsorsListing"]["shortDescription"]))
                        db.commit()
                        logging.info(login + " ~~~~~~~~~ data commit into database success!!")
            self.q.task_done()
            cur.close()
            db.close()
        except Exception as e:
            logging.fatal(e)
            return
def run(self):
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        try:
            directory = base_path + "/" + login
            files = base.read_all_filename_in_directory(directory)
            for file in files:
                text = base.get_info_from_file(file)
                if text is False:
                    logging.warn("file not existed: " + file)
                else:
                    obj = json.loads(text)
                    logging.info("read file: " + file)
                    count = 1
                    for node in obj["data"]["user"]["contributionsCollection"]["pullRequestReviewContributions"]["edges"]:
                        try:
                            # maybe happen duplicate key when insert data
                            cur.execute(
                                "insert ignore into github_user_pr_review "
                                "(pr_database_id, login, created_at, body) "
                                "values (%s, %s, %s, %s)",
                                (node["node"]["pullRequestReview"]["databaseId"],
                                 node["node"]["pullRequestReview"]["author"]["login"],
                                 base.time_handler(node["node"]["pullRequestReview"]["createdAt"]),
                                 node["node"]["pullRequestReview"]["body"]))
                            db.commit()
                            # logging.info("the " + str(count) + "th record in file: " + file)
                        except Exception as e:
                            logging.error(e)
                        count += 1
            self.q.task_done()
            cur.close()
            db.close()
        except Exception as e:
            logging.fatal(e)
            return
def run(self):
    # get db connection
    db = connectMysqlDB(config, autocommit=False)
    cur = db.cursor()
    while True:
        try:
            print ""
            work = self.q.get(timeout=0)
            print "the number of work in queue: " + str(self.q.qsize())
            number = work["number"]
            owner = work["owner"]
            repo = work["repo"]
            page = 1
            sum = 0  # sum of inserted db
            # every comments has several pages of reaction
            while True:
                # get a suitable token and combine header
                github_token = get_token()
                headers = {
                    'User-Agent': 'Mozilla/5.0',
                    'Authorization': 'token ' + github_token,
                    'Content-Type': 'application/json',
                    'method': 'GET',
                    'Accept': 'application/vnd.github.squirrel-girl-preview+json'
                }
                # print "headers is: " + str(headers)
                # combine url
                url = "https://api.github.com/repos/" + owner + "/" + repo + "/issues" + "/" + str(number) + "/reactions"
                url = url + "?page=" + str(page)
                print "url is: " + url
                try:
                    # request data and parse response
                    req = urllib2.Request(url=url, headers=headers)
                    response = urllib2.urlopen(req)
                    result = json.loads(response.read().decode("utf-8"))
                    # print result
                    length = len(result)
                    sum += length
                    if length == 0:
                        print "finish, comment " + str(number) + " has reactions: " + str(sum)
                        self.q.task_done()
                        break
                    # write file
                    json_str = json.dumps(result)
                    # print "json format data: " + json_str
                    filename = base_path + "/" + owner + "&" + repo + "/" + str(number) + "/" + str(page) + ".json"
                    flag = base.generate_file(filename, json_str)
                    if flag is True:
                        print "create file successfully: " + filename
                    elif flag is False:
                        print "file is already existed: " + filename
                    else:
                        print "create file failed: " + flag + " filename: " + filename
                        continue
                    page += 1  # page++
                    # handle response json data
                    num = 0
                    while num < length:
                        insert_dict = {
                            "id": result[num].get("id"),
                            "user_login": result[num]["user"].get("login"),
                            "created_at": result[num].get("created_at"),
                            "content": result[num].get("content"),
                        }
                        print "insert info: " + str(insert_dict)
                        # insert data to database table
                        try:
                            if insert_dict is not None:
                                cur.execute(
                                    "insert into github_reaction "
                                    "(id, number, user_login, owner_login, repo, created_at, flag, content) "
                                    "values (%s, %s, %s, %s, %s, %s, %s, %s)",
                                    (insert_dict["id"], number,
                                     insert_dict["user_login"], owner, repo,
                                     base.time_handler(insert_dict["created_at"]),
                                     1, insert_dict["content"]))
                                db.commit()
                        except Exception as e:
                            print str(e)
                        num += 1
                except Exception as e:
                    print str(e) + " error with this page: " + url
                    # if e.code == 403:
                    #     break
                    # if e.code != 404:
                    #     # mainly 403, sometimes 503
                    #     # token rate limit
                    #     self.q.put(work)  # put into the queue again
                    #     sleep_time_tokens[github_token] = time.time()  # set sleep time for that token
                    #     insert_dict = None
                    # else:
                    #     insert_dict["body"] = "404 error"
                    #     insert_dict["created_at"] = None
                    #     insert_dict["updated_at"] = None
                else:
                    pass  # 403... error
        except Queue.Empty:
            cur.close()
            db.close()
            return
        except Exception as e:
            print str(e) + "qiubing"  # unexpected error, don't interrupt the program
def run(self):
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        try:
            directory = base_path + "/" + login
            files = os.listdir(directory)
            for file in files:
                file_path = directory + "/" + file
                text = base.get_info_from_file(file_path)
                if text is False:
                    logging.warn("file not existed: " + file_path)
                    continue
                obj = json.loads(text)
                logging.info("read file: " + file_path)
                count = 1
                # The GitHub user has received sponsorships but has never sponsored anyone else.
                # This data is still written into github_sponsorships_as_sponsor so it can be filtered later.
                if len(obj["data"]["user"]["sponsorshipsAsSponsor"]["edges"]) == 0:
                    logging.warn("the user " + login + " doesn't sponsor others")
                    cur.execute(
                        "insert into github_sponsorships_as_sponsor "
                        "(login, sponsor_login, flag) "
                        "values (%s, %s, %s)",
                        (login, login, str(base.flag4)))
                    db.commit()
                    continue
                for edge in obj["data"]["user"]["sponsorshipsAsSponsor"]["edges"]:
                    if edge["node"]["privacyLevel"] == "PRIVATE":
                        logging.info("the " + str(count) + "th record is private in file: " + file_path)
                        count += 1
                        continue
                    else:
                        slug = edge["node"]["sponsorable"]["sponsorsListing"]["slug"].split("-")[1]
                        cur.execute(
                            "insert into github_sponsorships_as_sponsor "
                            "(login, slug, sponsor_login, flag, created_at) "
                            "values (%s, %s, %s, %s, %s)",
                            (slug,
                             edge["node"]["sponsorable"]["sponsorsListing"]["slug"],
                             obj["data"]["user"]["login"], str(3),
                             base.time_handler(edge["node"]["createdAt"])))
                        db.commit()
                        logging.info("the " + str(count) + "th record in file: " + file_path)
                        count += 1
            self.q.task_done()
            cur.close()
            db.close()
        except Exception as e:
            logging.fatal(e)
def run(self):
    work = self.q.get(timeout=0)
    print "the number of work in queue: " + str(self.q.qsize())
    id = work["repo_id"]
    owner = work["owner"]
    repo = work["repo"]
    page = 1
    sum = 0  # sum of inserted db
    # get db connection
    db = connectMysqlDB(config, autocommit=False)
    cur = db.cursor()
    while True:
        print ""
        try:
            # get a suitable token and combine header
            github_token = get_token()
            headers = {
                'User-Agent': 'Mozilla/5.0',
                'Authorization': 'token ' + github_token,
                'Content-Type': 'application/json',
                'method': 'GET',
                'Accept': 'application/vnd.github.squirrel-girl-preview+json'
            }
            # print "headers is: " + str(headers)
            # combine url, notice: per page is 30
            url = "https://api.github.com/repos/" + owner + "/" + repo + "/issues"
            url = url + "?state=all" + "&page=" + str(page) + "&per_page=30"
            print "url is: " + url
            insert_dict = {}
            try:
                # request data and parse response
                req = urllib2.Request(url=url, headers=headers)
                response = urllib2.urlopen(req)
                result = json.loads(response.read().decode("utf-8"))
                # print result
                # judge response info empty
                length = len(result)
                sum += length
                if length == 0:
                    # close the db connection
                    cur.close()
                    db.close()
                    print "finish & the sum of issue of pull request is: " + str(sum)
                    self.q.task_done()
                    return
                # write file
                json_str = json.dumps(result)
                # print "json format data: " + json_str
                filename = base_path + "/" + owner + "&" + repo + "&" + str(id) + "/" + str(page) + ".json"
                flag = base.generate_file(filename, json_str)
                if flag is True:
                    print "create file successfully: " + filename
                elif flag is False:
                    print "file is already existed: " + filename
                else:
                    print "create file failed: " + flag + " filename: " + filename
                    continue
                page += 1  # page++
                # handle response json data
                num = 0
                while num < length:
                    reactions = result[num]["reactions"]
                    insert_dict = {
                        "id": result[num].get("id"),
                        "number": result[num].get("number"),
                        "comments": result[num].get("comments"),
                        "created_at": result[num].get("created_at"),
                        "updated_at": result[num].get("updated_at"),
                        "user_login": result[num]["user"].get("login"),
                        "total_count": reactions.get("total_count"),
                        "up": reactions.get("+1"),
                        "down": reactions.get("-1"),
                        "laugh": reactions.get("laugh"),
                        "confused": reactions.get("confused"),
                        "heart": reactions.get("heart"),
                        "hooray": reactions.get("hooray"),
                        "rocket": reactions.get("rocket"),
                        "eyes": reactions.get("eyes"),
                    }
                    # print "insert num: " + str(insert_dict)
                    # 0 represent issue, 1 represent pull request
                    if "pull_request" not in result[num]:
                        flag = 0
                    else:
                        flag = 1
                    print "the issue type: " + str(flag)
                    # insert data to database table
                    try:
                        if insert_dict is not None:
                            cur.execute(
                                "insert into github_issue "
                                "(id, number, user_login, owner_login, repo, created_at, updated_at, flag, comments, total_count, up, down, laugh, confused, heart, hooray, rocket, eyes) "
                                "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                                (insert_dict["id"], insert_dict["number"],
                                 insert_dict["user_login"], owner, repo,
                                 base.time_handler(insert_dict["created_at"]),
                                 base.time_handler(insert_dict["updated_at"]),
                                 flag, insert_dict["comments"],
                                 insert_dict["total_count"], insert_dict["up"],
                                 insert_dict["down"], insert_dict["laugh"],
                                 insert_dict["confused"], insert_dict["heart"],
                                 insert_dict["hooray"], insert_dict["rocket"],
                                 insert_dict["eyes"]))
                            db.commit()
                    except Exception as e:
                        print str(e)
                    num += 1
            except urllib2.HTTPError as e:
                print str(e.code) + " error with this page: " + url
                if e.code != 404:
                    # mainly 403, sometimes 503
                    # token rate limit
                    self.q.put(work)  # put into the queue again
                    sleep_time_tokens[github_token] = time.time()  # set sleep time for that token
                    insert_dict = None
                else:
                    insert_dict["body"] = "404 error"
                    insert_dict["created_at"] = None
                    insert_dict["updated_at"] = None
            else:
                pass  # 403... error
        except Exception as e:
            print str(e)  # unexpected error, don't interrupt the program