def so_clouds(self, year):
    """Write one word-cloud PNG per annotated community of ``year``.

    Loads the mask image, the annotated community names and the top tags per
    community for ``year``, then renders each community's tag frequencies
    into ``Word_Clouds/<year>/<community>.png`` under ``self.path``.
    Slashes are removed from the community name when building the file name.
    """
    def black_text(*args, **kwargs):
        # Every word is drawn in black, whatever the background colour is.
        return (0, 0, 0)

    mask_image = Image.open(
        "/home/iraklis/Desktop/StackOverflow/Media/Images/WordClouds/proper_circle.png"
    )
    mask = np.array(mask_image)

    communities_prefix = self.path + "Communities0/" + str(year)
    anno_coms = tools.load_pickle(communities_prefix + "_annotated_communites.pickle")
    tags = tools.load_pickle(communities_prefix + "_top_tags_per_community.pickle")

    input_dict = self.convert_to_list_of_dict(tags)
    list_of_colors = self.choose_colors(anno_coms)

    output_dir = self.path + "Word_Clouds/" + str(year) + "/"
    for idx, com in enumerate(anno_coms):
        # Background colour and frequencies are looked up by community position.
        cloud = WordCloud(
            background_color=list_of_colors[idx],
            mask=mask,
            max_words=60,
            prefer_horizontal=1,
            contour_width=0.1,
            collocations=False,
            margin=1,
            width=660,
            height=660,
            color_func=black_text,
        )
        cloud.generate_from_frequencies(input_dict[idx])
        cloud.to_file(output_dir + com.replace('/', '') + ".png")
def overall(self):
    """Print the mean share of new users between consecutive snapshots.

    For each pair of consecutive entries in ``self.all_dates`` the nodes of
    all communities are flattened into a set per snapshot. The share of new
    users is the number of nodes present in the later snapshot but not in
    the earlier one, divided by the size of the later snapshot. The mean of
    these shares is printed.

    Each snapshot's ``_infomap_coms.pickle`` is loaded once and reused as the
    "previous" snapshot of the following pair.

    Raises:
        ZeroDivisionError: if a later snapshot contains no nodes at all.
    """
    def load_users(date):
        # Flatten the per-community node lists of one snapshot into a set.
        communities = tools.load_pickle(self.path + date + "_infomap_coms.pickle")
        return {node for node_list in communities for node in node_list}

    percentage_list = []
    prev_nodes = None
    for c_date, n_date in zip(self.all_dates, self.all_dates[1:]):
        if prev_nodes is None:
            # Only the very first pair needs its "previous" side loaded.
            prev_nodes = load_users(c_date)
        next_nodes = load_users(n_date)
        new_users = next_nodes - prev_nodes
        percentage_list.append(len(new_users) / len(next_nodes))
        prev_nodes = next_nodes
    print(np.mean(percentage_list))
def community_timeseries(self, com_category, format):
    """Plot the per-year user share of the communities in one category.

    Args:
        com_category: key into ``self.display_coms`` selecting which
            communities are drawn; also used in the output file name.
        format: output format, used both as the file extension and as the
            ``format`` argument of ``plt.savefig``.

    The figure is saved as ``p_usrs_<com_category>.<format>`` under
    ``self.path``.
    """
    # One x-axis point per year, from 2008 up to (not including) 2021.
    range_start = parse("2008-01-01T00:00:00.000")
    range_end = parse("2021-01-01T00:00:00.000")
    one_year = relativedelta(years=1)
    year_points = []
    cursor = range_start
    while cursor < range_end:
        year_points.append(cursor)
        cursor += one_year
    m_dates = matplotlib.dates.date2num(year_points)

    # Every community name seen in any snapshot gets an all-zero series.
    all_coms = set()
    for c_year in self.all_dates:
        all_coms.update(
            tools.load_pickle(self.path + c_year + "_annotated_communites.pickle"))
    # 13 slots match the 13 yearly x-axis points built above.
    series_by_com = {com: [0] * 13 for com in all_coms}

    # Fill in, per snapshot, each community's fraction of that snapshot's nodes.
    for idx, year in enumerate(self.all_dates):
        com_nodes = tools.load_pickle(self.path + year + "_infomap_coms.pickle")
        total_nodes = sum(len(members) for members in com_nodes)
        annotated = tools.load_pickle(self.path + year + "_annotated_communites.pickle")
        for members, com_name in zip(com_nodes, annotated):
            series_by_com[com_name][idx] = len(members) / total_nodes

    for com in self.display_coms[com_category]:
        plt.plot_date(m_dates, series_by_com[com], 'b-',
                      color=self.color_dict[com], label=com)

    plt.xlabel('Date')
    plt.ylabel('Percentage of Users')
    plt.title("Community Sizes per Year")
    plt.xticks(rotation=45)

    # Fix the y range and show the fractions as percentages.
    axes = plt.gca()
    axes.set_ylim([0, 0.41])
    axes.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    plt.legend()

    plt.savefig(self.path + "p_usrs_" + com_category + "." + format,
                bbox_inches='tight', format=format, dpi=300)
def create_relations(self, year):
    """Write the pairwise user relation edge list for ``year``.

    Loads ``year_scores/<year>``, builds a normalized profile per user, and
    writes one CSV row ``Source,Target,Weight`` for every unordered pair of
    users to ``Graphs/Relations/<year>_users_relations.csv``. The weight is
    ``2 - self.users_distance2(profile_a, profile_b)``.

    The normalized profiles are then pickled to
    ``year_scores/normalized_<year>``. The file name used to be hard-coded
    to ``normalized_2020``, so runs for other years overwrote that file; for
    ``year == 2020`` the path is unchanged.

    Args:
        year: the year whose score file is processed; converted with ``str``
            when building paths.
    """
    start_time = time.time()

    # Load and normalize before opening the CSV, so a missing or unreadable
    # score file does not leave a truncated relations file behind.
    user_tag_dict = tools.load_pickle(self.files_path + "year_scores/" + str(year))
    normalized_user_tag_dict = dict()
    for user_id, tag_list in user_tag_dict.items():
        normalized_user_tag_dict[user_id] = [
            tag_list[0],
            # Normalize a deep copy; the loaded tag scores are not passed in.
            self.normalize_tag_score(copy.deepcopy(tag_list[1])),
            set(tag_list[1].keys()),
        ]

    all_users_ids = list(normalized_user_tag_dict.keys())
    print(len(all_users_ids))

    relations_path = (self.files_path + "Graphs/Relations/" + str(year)
                      + "_users_relations.csv")
    with open(relations_path, "w") as rel_file:
        rel_file.write("Source,Target,Weight\n")
        # Each unordered pair is visited once: user i against users i+1..n-1.
        for idx, outer_user_id in enumerate(all_users_ids[:-1]):
            print(idx)
            outer_profile = normalized_user_tag_dict[outer_user_id]
            for inner_user_id in all_users_ids[idx + 1:]:
                usr_distance2 = self.users_distance2(
                    outer_profile, normalized_user_tag_dict[inner_user_id])
                rel_file.write(
                    str(outer_user_id) + ',' + str(inner_user_id) + ','
                    + str(2 - usr_distance2) + "\n")

    tools.save_pickle(self.files_path + "year_scores/normalized_" + str(year),
                      normalized_user_tag_dict)
    print("execution time", time.time() - start_time)
def gather_all_names(self):
    """Add every annotated community name of 2008-2020 to ``self.all_com_names``."""
    for year in range(2008, 2021):
        pickle_path = (self.path + "Communities0/" + str(year)
                       + "_annotated_communites.pickle")
        self.all_com_names.update(tools.load_pickle(pickle_path))
    print()
def reform_post_lists(self):
    """Re-key every monthly post list by its integer fourth field.

    For each file under ``Posts/posts_per_month/`` the pickled list of posts
    is converted into a dict mapping ``int(post[3])`` to the tuple
    ``(post[0], post[1], post[2])``. The dict is saved under the same file
    name in ``pivot_files/reformed_posts/``.
    """
    print("In reform post lists.")
    source_pattern = self.data_path + "Posts/posts_per_month/*"
    target_dir = self.data_path + "pivot_files/reformed_posts/"
    for filepath in glob.iglob(source_pattern):
        # Keep only the last path component as the output file name.
        file_name = filepath.rsplit("/", 1)[-1]
        print(file_name)
        posts_by_id = {
            int(post[3]): (post[0], post[1], post[2])
            for post in tools.load_pickle(filepath)
        }
        tools.save_pickle(target_dir + file_name, posts_by_id)
    print("reform post lists done.")
def process_scores(self):
    """Compute and store the slot scores of every month in the date range.

    Walks month by month from ``self.start_date_obj`` up to (not including)
    ``self.end_date_obj``. Each month's score pickle is loaded and passed
    through ``self.slot_score``; the result is stored in
    ``self.month_tag_scores`` under the ``"<year>-<month>"`` key. The whole
    mapping is finally saved as ``month_tag_scores`` under
    ``self.score_path``.
    """
    one_month = relativedelta(months=1)
    cursor = self.start_date_obj
    while cursor < self.end_date_obj:
        # Month keys are not zero-padded, e.g. "2008-8".
        date_string = f"{cursor.year}-{cursor.month}"
        print(date_string)
        monthly_scores = tools.load_pickle(self.score_path + date_string)
        self.month_tag_scores[date_string] = self.slot_score(monthly_scores)
        cursor += one_month
    tools.save_pickle(self.score_path + "month_tag_scores", self.month_tag_scores)
def create_date_indexes(self):
    """Save, for every month, the set of fourth fields of that month's posts.

    Walks month by month from ``self.start_date_obj`` up to (not including)
    ``self.end_date_obj``. Each ``Posts/posts_per_month/<year>-<month>``
    pickle is loaded, ``post[3]`` is collected from every entry into a set
    (presumably the post ids, given the target directory name), and the set
    is stored under ``pivot_files/date_to_postid/<year>-<month>``.
    """
    print("In create_date_indexes.")
    one_month = relativedelta(months=1)
    cursor = self.start_date_obj
    while cursor < self.end_date_obj:
        date_string = f"{cursor.year}-{cursor.month}"
        print(date_string)
        posts = tools.load_pickle(self.data_path + "Posts/posts_per_month/" + date_string)
        month_set = {post[3] for post in posts}
        tools.save_pickle(self.data_path + "pivot_files/date_to_postid/" + date_string,
                          month_set)
        cursor += one_month
    print("create_date_indexes done.")
def create_date_index(self):
    """Build an in-memory month index of the monthly post files.

    Covers every month from 2008-08 up to (not including) 2020-12-31.

    Returns:
        A list of ``(date_string, month_set)`` tuples in chronological
        order. ``date_string`` is ``"<year>-<month>"`` without zero padding
        and ``month_set`` holds ``post[3]`` of every entry in that month's
        ``Posts/posts_per_month`` pickle.
    """
    range_end = parse("2020-12-31T00:00:00.000")
    one_month = relativedelta(months=1)
    cursor = parse("2008-08-01T00:00:00.000")

    set_list = []
    while cursor < range_end:
        date_string = f"{cursor.year}-{cursor.month}"
        posts = tools.load_pickle(self.data_path + "Posts/posts_per_month/" + date_string)
        set_list.append((date_string, {post[3] for post in posts}))
        cursor += one_month

    print("Date index created.")
    print(len(set_list))
    return set_list
def timeseries_plots(self):
    """Plot the monthly vote-score time series of the web-framework tags.

    Reads the tag time series from a fixed pickle path, draws one line per
    tag over the months from ``self.start_date_obj`` up to (not including)
    ``self.end_date_obj``, and saves the figure as ``web_frameworks.png``
    under ``self.path``.
    """
    # Alternative tag groups, with their matching titles kept further down:
    # tag_string = ["android", "ios", "windows", "linux", "unix"]
    # tag_string = ["c++", "c", "python", "java", "r", "ruby", "javascript", "php", "c#"]
    tag_string = [
        "reactjs", "ruby-on-rails", "asp.net", "angular", "angularjs",
        "django", "vue.js", "laravel", "spring", "flask",
    ]
    tag_dict = tools.load_pickle(
        "/home/iraklis/PycharmProjects/SO_New/SRC3/Revisions/I_O/Scores/tag_timeseries.pickle")
    default_colors = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan',
        'fuchsia', 'black',
    ]

    # One x-axis point per month of the configured range.
    one_month = relativedelta(months=1)
    month_points = []
    cursor = self.start_date_obj
    while cursor < self.end_date_obj:
        month_points.append(cursor)
        cursor += one_month
    m_dates = matplotlib.dates.date2num(month_points)

    # Tags and colours are paired by position; there are more colours than tags.
    for tag, line_color in zip(tag_string, default_colors):
        plt.plot_date(m_dates, tag_dict[tag], 'b-', color=line_color, label=tag)

    plt.xlabel('Date')
    plt.ylabel('Vote Score')
    plt.title("Web Frameworks")
    # plt.title("Programming Languages")
    # plt.title("Operating Systems")
    plt.legend()
    plt.xticks(rotation=45)
    plt.savefig(self.path + "/web_frameworks.png", bbox_inches='tight',
                format="png", dpi=300)
def merge_months(self, year, start_month, end_month):
    """Merge monthly user scores of ``year`` into one yearly score file.

    Args:
        year: year whose ``Scores/<year>-<month>`` pickles are merged.
        start_month: first month to include.
        end_month: month at which to stop; it is NOT included, because the
            months are taken from ``range(start_month, end_month)``.

    Each monthly entry is ``[total_score, {tag: score}]`` per user id. Totals
    and per-tag scores are summed across months and the result is saved to
    ``year_scores/<year>`` under ``self.files_path``.
    """
    year_dict = {}
    scores_prefix = self.files_path + "Scores/" + str(year) + "-"
    for month in range(start_month, end_month):
        monthly_scores = tools.load_pickle(scores_prefix + str(month))
        for user_id, tag_list in monthly_scores.items():
            if user_id not in year_dict:
                # First appearance: adopt the month's entry as the running total.
                year_dict[user_id] = tag_list
                continue
            merged_entry = year_dict[user_id]
            merged_entry[0] += tag_list[0]
            merged_tags = merged_entry[1]
            for tag, tag_score in tag_list[1].items():
                if tag in merged_tags:
                    merged_tags[tag] += tag_score
                else:
                    merged_tags[tag] = tag_score
    tools.save_pickle(self.files_path + "year_scores/" + str(year), year_dict)
def reform_votes(self):
    """Attach the month of the voted post to every vote and re-save by month.

    For each month from ``self.start_date_obj`` up to (not including)
    ``self.end_date_obj``, the votes in ``Votes/votes_per_month/<year>-<month>``
    are matched against ``self.post_date_index``, a sequence of
    ``(date_string, id_set)`` entries. Every vote whose ``vote[1]`` appears
    in an entry's id set yields a tuple
    ``(vote[0], vote[1], vote[2], first_day_of_that_month)``. The tuples are
    sorted by that date and saved to
    ``pivot_files/reformed_votes/<year>-<month>``.

    Votes that match no entry are dropped. A vote matching several entries
    is emitted once per match, as before.
    """
    print("In reform votes.")

    # Parse each month label once, instead of re-splitting and re-building
    # the datetime for every matching vote.
    month_index = []
    for date_tuple in self.post_date_index:
        split_date = date_tuple[0].split('-')
        month_start = datetime(int(split_date[0]), int(split_date[1]), 1)
        month_index.append((month_start, date_tuple[1]))

    current_date_obj = self.start_date_obj
    month_delta = relativedelta(months=1)
    while current_date_obj < self.end_date_obj:
        vote_date = str(current_date_obj.year) + "-" + str(current_date_obj.month)
        c_month_votes = tools.load_pickle(
            self.data_path + "Votes/votes_per_month/" + vote_date)
        reformed_votes = []
        for counter, vote in enumerate(c_month_votes):
            # Progress report every 10000 votes.
            if counter % 10000 == 0:
                print((counter / len(c_month_votes) * 100), "%")
            for month_start, post_ids in month_index:
                if vote[1] in post_ids:
                    reformed_votes.append((vote[0], vote[1], vote[2], month_start))
        # Order by post month so consumers can walk the post files in order.
        reformed_votes.sort(key=itemgetter(3))
        tools.save_pickle(
            self.data_path + "pivot_files/reformed_votes/" + vote_date,
            reformed_votes)
        current_date_obj += month_delta
    print("Reform votes finished.")
def working_on_votes(self):
    """Turn the reformed monthly votes into per-user, per-tag scores.

    For every month from ``self.start_date_obj`` up to (not including)
    ``self.end_date_obj`` the reformed votes of that month are processed in
    two passes:

    1. Votes on questions (post type ``'1'``): a vote of type ``'2'`` gives
       the question owner 10 points, in total and for each of the
       question's tags. Votes on answers (post type ``'2'``) are only
       collected here, together with the date of the parent question.
    2. The collected answers are sorted by question date. A vote of type
       ``'1'`` gives the answer owner 15 points and a vote of type ``'2'``
       gives 10, again in total and for each tag of the parent question.

    Posts whose owner id is ``-99`` are skipped. Scores accumulate in
    ``self.date_to_user_score[<year>-<month>]`` as
    ``{user_id: [total, {tag: score}]}``. The whole structure is saved to
    ``pivot_files/month_scores`` at the end.
    """
    def month_key(date_obj):
        # Month labels are not zero-padded, e.g. "2008-8".
        return str(date_obj.year) + "-" + str(date_obj.month)

    def add_points(user_scores, user_id, tag_list, points):
        # Credit ``points`` to the user's total and to every tag in tag_list.
        if user_id in user_scores:
            entry = user_scores[user_id]
            entry[0] += points
            tag_scores = entry[1]
            for tag in tag_list:
                if tag in tag_scores:
                    tag_scores[tag] += points
                else:
                    tag_scores[tag] = points
        else:
            fresh_tags = dict()
            for tag in tag_list:
                fresh_tags[tag] = points
            user_scores[user_id] = [points, fresh_tags]

    posts_dir = self.data_path + "pivot_files/reformed_posts/"
    one_month = relativedelta(months=1)
    cursor = self.start_date_obj
    while cursor < self.end_date_obj:
        vote_date = month_key(cursor)
        print(vote_date)
        c_month_votes = tools.load_pickle(
            self.data_path + "pivot_files/reformed_votes/" + vote_date)
        batch_start = time.time()

        # The votes are ordered by the date of the post they were cast on,
        # so each reformed_posts file is loaded at most once per run of
        # equal dates.
        post_date = "2008-8"
        date_to_id_to_post = tools.load_pickle(posts_dir + post_date)

        # Answers are collected with the date of their question and handled
        # afterwards with the same approach used for the questions.
        answer_collection = []
        for vote_count, vote in enumerate(c_month_votes):
            if vote_count % 10000 == 0:
                print("Percentage: ", vote_count/len(c_month_votes) * 100, "%")
                print("Execution time", time.time() - batch_start)
                batch_start = time.time()

            vote_month = month_key(vote[3])
            if vote_month != post_date:
                post_date = vote_month
                date_to_id_to_post = tools.load_pickle(posts_dir + post_date)
            post_tuple = date_to_id_to_post[int(vote[1])]
            post_type = post_tuple[0]
            owner_id = post_tuple[1]

            if post_type == '1':
                # Question, skipped when the post is deleted.
                if owner_id != -99:
                    tag_list = self.parse_tags(post_tuple[2])
                    user_scores = self.date_to_user_score[vote_date]
                    if vote[2] == '2':
                        add_points(user_scores, owner_id, tag_list, 10)
            elif post_type == '2':
                # Answer: remember it with the month of its question.
                if owner_id != -99:
                    question_date = self.get_post_date(post_tuple[2])
                    if question_date != 'no_post':
                        split_date = question_date.split('-')
                        temp_date_obj = datetime(int(split_date[0]),
                                                 int(split_date[1]), 1)
                        answer_collection.append(
                            (post_tuple[0], post_tuple[1], post_tuple[2],
                             vote[2], temp_date_obj))

        # Second pass: answers, ordered by the month of their question.
        answer_collection.sort(key=itemgetter(4))
        question_date = "2008-8"
        date_to_id_to_question = tools.load_pickle(posts_dir + question_date)
        for answer in answer_collection:
            answer_month = month_key(answer[4])
            if answer_month != question_date:
                question_date = answer_month
                date_to_id_to_question = tools.load_pickle(posts_dir + question_date)
            question_tuple = date_to_id_to_question[int(answer[2])]
            tag_list = self.parse_tags(question_tuple[2])
            user_scores = self.date_to_user_score[vote_date]
            if answer[3] == '1':
                add_points(user_scores, answer[1], tag_list, 15)
            elif answer[3] == '2':
                add_points(user_scores, answer[1], tag_list, 10)

        cursor += one_month
    tools.save_pickle(self.data_path + "pivot_files/month_scores",
                      self.date_to_user_score)
def active_users(self):
    """Plot monthly post counts and active-user counts, 2008-08 to 2020-12.

    For every month the scores and posts pickles under ``Month_Analysis/``
    are loaded. The method counts questions (``record[0] == '1'``), answers
    (``record[0] == '2'``), distinct ``record[1]`` values among the month's
    posts, and the number of entries in the month's score mapping. The
    per-year union of posting users is saved as ``year_active_users``. The
    figure has two y axes, posts on the left and users on the right, and is
    saved as ``users.png`` under ``self.path``.
    """
    range_start = parse("2008-08-01T00:00:00.000")
    range_end = parse("2021-01-01T00:00:00.000")
    m_delta = relativedelta(months=1)

    # Month starts, used both as x-axis points and to drive the data loop.
    month_points = []
    cursor = range_start
    while cursor < range_end:
        month_points.append(cursor)
        cursor += m_delta
    m_dates = matplotlib.dates.date2num(month_points)

    questions_rec = []
    answer_rec = []
    post_users_rec = []
    score_users_rec = []
    year_users_dict = {}

    for month_start in month_points:
        year_month_str = str(month_start.year) + "-" + str(month_start.month)
        # Score and tags of each user id for this month.
        month_user_scores = tools.load_pickle(
            self.path + "Month_Analysis/Scores/" + year_month_str)
        month_posts = tools.load_pickle(
            self.path + "Month_Analysis/Posts/" + year_month_str)

        m_questions = 0
        m_answers = 0
        m_active_users = set()
        for record in month_posts:
            if record[0] == '1':
                m_questions += 1
            if record[0] == '2':
                m_answers += 1
            m_active_users.add(record[1])

        questions_rec.append(m_questions)
        answer_rec.append(m_answers)
        post_users_rec.append(len(m_active_users))
        score_users_rec.append(len(month_user_scores))

        # Accumulate the users that posted at least once in each year.
        year_key = str(month_start.year)
        seen_this_year = year_users_dict.get(year_key)
        if seen_this_year is None:
            year_users_dict[year_key] = m_active_users
        else:
            year_users_dict[year_key] = seen_this_year.union(m_active_users)

    tools.save_pickle(self.path + "year_active_users", year_users_dict)

    fig, ax1 = plt.subplots()

    color = '#000000'
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Posts', color=color)
    # NOTE(review): these two calls reuse the question data with the labels
    # and colours of the user series drawn on ax2 below — presumably so the
    # legend created on this axes lists all four series. Confirm before
    # changing.
    ax1.plot_date(m_dates, questions_rec, 'None', color="tab:red",
                  label="Users made a Post")
    ax1.plot_date(m_dates, questions_rec, 'None', color="tab:orange",
                  label="Users received Score")
    ax1.plot_date(m_dates, questions_rec, 'b-', color="tab:blue", label="Questions")
    ax1.plot_date(m_dates, answer_rec, 'b-', color="tab:green", label="Answers")
    ax1.tick_params(axis='y', labelcolor=color)
    plt.legend()
    axes = plt.gca()
    axes.set_ylim([0, 330000])

    # Second axes sharing the same x-axis; the x-label is already set on ax1.
    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.set_ylabel('Active Users', color=color)
    ax2.plot_date(m_dates, post_users_rec, 'b-', color="tab:red",
                  label="Users made a Post")
    ax2.plot_date(m_dates, score_users_rec, 'b-', color="tab:orange",
                  label="Users received Score")
    axes = plt.gca()
    axes.set_ylim([0, 440000])
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.title("Posts and Active Users per Month")
    plt.xticks(rotation=45)
    plt.savefig(self.path + "users.png", bbox_inches='tight', format="png", dpi=300)