def badge_processing(site_name, directory, database):
    site = []
    index = []
    UserId = []
    BadgeName = []
    Class = []
    BadgeDate = []
    for event, elem in ET.iterparse(os.path.join(directory, "Badges.xml")):
        if event == "end":
            try:
                ind = int(elem.attrib["Id"])
                userid = int(elem.attrib["UserId"])
                badgename = elem.attrib["Name"]
                badgeclass = elem.attrib["Class"]
                badgedate = elem.attrib["Date"]
                site.append(site_name)
                index.append(ind)
                UserId.append(userid)
                BadgeName.append(badgename)
                Class.append(badgeclass)
                BadgeDate.append(badgedate)
                elem.clear()
            except Exception:
                pass
    df = pd.DataFrame({
        "Site": site,
        "BadgeId": index,
        "UserId": UserId,
        "BadgeName": BadgeName,
        "BadgeClass": Class,
        "BadgeDate": BadgeDate
    })
    write_table(database, "Badges", df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, "# badges: %d" % len(df))
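
# The write_table helper used throughout this module is not shown in this
# section. The sketch below only illustrates the assumed contract (append a
# DataFrame to a table of the SQLite database via pandas.DataFrame.to_sql);
# the project's actual implementation and its default if_exists behaviour
# may differ.
def write_table_sketch(database, table_name, df, if_exists="append"):
    """Illustrative sketch: write a DataFrame into a table of the SQLite database."""
    DB = sqlite3.connect(database)
    df.to_sql(table_name, DB, if_exists=if_exists, index=False)
    DB.close()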
def remove_site(site, database):
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, "Removing old database entries of site " + site)
    tables = [
        "AnswerMeta", "AnswerText", "Badges", "Comments", "FormulasComments",
        "FormulasPosts", "PostIdRelatedPostId", "QuestionAcceptedAnswer",
        "QuestionTags", "QuestionText", "QuestionMeta", "Users", "Tags"
    ]
    DB = sqlite3.connect(database)
    cursor = DB.cursor()
    for table in tables:
        # table names cannot be bound as SQL parameters; the site value is bound
        cursor.execute('DELETE FROM "' + table + '" WHERE site = ?', (site,))
    # the MathML tables are created later by parse_formulas.py and may not exist yet
    tables = ["FormulasPostsMathML", "FormulasCommentsMathML"]
    for table in tables:
        cursor.execute(
            "SELECT count(name) FROM sqlite_master WHERE type='table' AND name=?",
            (table,))
        # if the count is 1, the table exists
        if cursor.fetchone()[0] == 1:
            cursor.execute('DELETE FROM "' + table + '" WHERE site = ?', (site,))
    DB.commit()
    DB.close()
def duplicate_questions(database, df):
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    result = df.groupby("LinkTypeId")
    for k, v in result:
        d = pd.DataFrame({
            "QuestionId": v["PostId"],
            "RelatedQuestionId": v["RelatedPostId"]
        })
        if k == 3:
            log(statistics_file, "# duplicate questions: %d" % len(d))
            file_name = "DuplicateQuestions"
        elif k == 1:
            log(statistics_file, "# related questions: %d" % len(d))
            file_name = "RelatedQuestionsSource2Target"
        else:
            # unexpected LinkTypeId: skip instead of writing to a stale table name
            continue
        write_table(database, file_name, d)
def process_question_acceptedanswer(site_name, questions, database):
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    sites = []
    questionId = []
    acceptedAnswerId = []
    for qid, aid in zip(questions["QuestionId"], questions["AcceptedAnswerId"]):
        if aid:
            sites.append(site_name)
            questionId.append(qid)
            acceptedAnswerId.append(aid)
    df = pd.DataFrame({
        "Site": sites,
        "QuestionId": questionId,
        "AcceptedAnswerId": acceptedAnswerId
    })
    write_table(database, "QuestionAcceptedAnswer", df)
    log(statistics_file, "# question-acceptedAnswer pairs: %d" % len(df))
def posts_processing(site_name, directory, database):
    questions = init_posts(PostTypeId=1)
    answers = init_posts(PostTypeId=2)
    for event, elem in ET.iterparse(os.path.join(directory, "Posts.xml")):
        if event == "end":
            try:
                process_element(questions, elem, PostTypeId=1)
                process_element(answers, elem, PostTypeId=2)
                elem.clear()
            except Exception:
                pass
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file,
        "# posts: " + str(len(questions["QuestionId"]) + len(answers["AnswerId"])))
    log(statistics_file, "# questions: " + str(len(questions["QuestionId"])))
    log(statistics_file, "# answers: " + str(len(answers["AnswerId"])))
    process_question_text(site_name, questions, database, directory)
    process_question_tags(site_name, questions, database)
    process_question_acceptedanswer(site_name, questions, database)
    process_question_meta(site_name, questions, database)
    questions.clear()  # clear the questions dictionary to free up memory
    process_answer_body(site_name, answers, database, directory)
    process_answer_meta(site_name, answers, database)
def save_to_html(figure_file_p, figure_file_c, df_tokens, df_words, df_stats_p,
                 df_stats_c, df_duplicates, directory, site):
    df_tokens.index += 1
    df_words.index += 1
    tokens = df_tokens.to_html(classes='table table-striped', bold_rows=False,
                               justify='center', border=2)
    words = df_words.to_html(classes='table table-striped', bold_rows=False,
                             justify='center', border=2)
    df_stats = pd.concat([df_stats_p, df_stats_c])
    stats = df_stats.to_html(classes='table table-striped', index=False,
                             justify='center', border=2)
    duplicates = df_duplicates.to_html(classes='table table-striped', index=False,
                                       justify='center', border=2)
    f = open(os.path.join(directory, site + '_stats.html'), 'w')
    text = """<html><head></head><body>
    <div style="margin-top:50px"><h1 style="text-align: center;">""" + site + """</h1></div>
    <div style="float:left; margin:10px"><h3 style="text-align: center;"><br><br>Common Words</h3>""" + words + """</div>
    <div style="float:left; margin:10px"><h3 style="text-align: center;"><br>Common Formula<br>Tokens</h3>""" + tokens + """</div>
    <div style="float:left; margin:10px"><h3 style="text-align: center;">Common Formula<br>Duplicates<br>(min. 2 Tokens)</h3>""" + duplicates + """</div>
    <div style="float:left; margin:10px">
    <div style="float:top"><h3 style="text-align: left;"><br><br>&emsp;Formula Statistics</h3>""" + stats + """</div>
    <div style="float:top; margin-top:20px">""" + '<img src="' + figure_file_p + '" alt="statistics figure ' + figure_file_p + '" width="600" style=\'border:2px solid #000000\'>' + """</div>
    <div style="float:top; margin-top:20px">""" + '<img src="' + figure_file_c + '" alt="statistics figure ' + figure_file_c + '" width="600" style=\'border:2px solid #000000\'>' + """</div>
    </div>
    </body></html>"""
    f.write(text)
    f.close()
    log("../output/statistics.log",
        "Wrote file " + os.path.join(directory, site + '_stats.html'))
def comments_formula_processing(site_name, database, directory, context_length):
    DB = sqlite3.connect(database)
    comments = pd.read_sql(
        'select CommentId, Text from "Comments" where Site="' + site_name + '"', DB)
    DB.close()
    Formulas = {"FormulaId": [], "Site": [], "CommentId": [], "LaTeXBody": [],
                "TokenLength": [], "StartingPosition": [], "Inline": []}
    error_count = 0
    starting_formula_index = current_formula_id(database)
    formula_index = 0
    for comment, body in zip(comments["CommentId"], comments["Text"]):
        if site_name not in DumpDownloader.special_delim:
            formulas, positions, inline, error = formula_extr(body, site_name)
        else:
            formulas, positions, inline, error = formula_extr_special(
                body, DumpDownloader.special_delim[site_name])
        if not error:
            for formula, position, inl in zip(formulas, positions, inline):
                Formulas["FormulaId"].append(starting_formula_index + formula_index)
                Formulas["Site"].append(site_name)
                Formulas["CommentId"].append(comment)
                Formulas["LaTeXBody"].append(formula)
                Formulas["TokenLength"].append(formula_token_length(formula))
                Formulas["StartingPosition"].append(position)
                Formulas["Inline"].append(inl)
                formula_index += 1
        else:
            error_count += 1
        if len(Formulas["FormulaId"]) > 1000000:
            # flush a full buffer to the database to limit memory usage
            df = pd.DataFrame(Formulas)
            write_table(database, 'FormulasComments', df, "append")
            Formulas = {"FormulaId": [], "Site": [], "CommentId": [], "LaTeXBody": [],
                        "TokenLength": [], "StartingPosition": [], "Inline": []}
            del df
    df = pd.DataFrame(Formulas)
    write_table(database, 'FormulasComments', df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, str(formula_index) + " formulas parsed from comments")
    log(statistics_file, str(error_count) + " errors in parsing comment formulas")
    log(statistics_file,
        "error rate parsing formulas from comments: " +
        format(error_count / len(comments["CommentId"]) * 100, ".4f") + " %")
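
# formula_extr / formula_extr_special are assumed to return the LaTeX formulas
# found in a text together with their starting positions, an inline flag
# (single-$ vs. $$-delimited) and an error flag for texts whose delimiters
# cannot be parsed. The sketch below only illustrates that assumed contract;
# it is not the project's parser and ignores the site-specific delimiters.
import re

def formula_extr_sketch(text):
    """Illustrative sketch: extract $-delimited formulas from a text."""
    formulas, positions, inline = [], [], []
    # match $$...$$ (display) before $...$ (inline)
    for match in re.finditer(r'\$\$(.+?)\$\$|\$(.+?)\$', text, re.DOTALL):
        display, single = match.group(1), match.group(2)
        formulas.append(display if display is not None else single)
        positions.append(match.start())
        inline.append(display is None)
    error = text.count("$") % 2 != 0  # unbalanced delimiters
    return formulas, positions, inline, error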
def questions_formula_processing(site_name, database, directory, context_length):
    DB = sqlite3.connect(database)
    questions = pd.read_sql('select * from "QuestionText" where Site="' + site_name + '"', DB)
    DB.close()
    Formulas = {"FormulaId": [], "Site": [], "PostId": [], "LaTeXBody": [],
                "TokenLength": [], "StartingPosition": [], "Inline": []}
    error_count = 0
    starting_formula_index = current_formula_id(database)
    formula_index = 0
    # question processing (title and body)
    for question, title, body in zip(questions["QuestionId"], questions["Title"],
                                     questions["Body"]):
        if site_name not in DumpDownloader.special_delim:
            formulas_title, positions_title, _, error_title = formula_extr(title, site_name)
            formulas_body, positions_body, inline, error_body = formula_extr(body, site_name)
        else:
            formulas_title, positions_title, _, error_title = formula_extr_special(
                title, DumpDownloader.special_delim[site_name])
            formulas_body, positions_body, inline, error_body = formula_extr_special(
                body, DumpDownloader.special_delim[site_name])
        # parsing errors occur (a total of ~6500); do not take formulas from "invalid" texts
        if not error_title and not error_body:
            for formula, position in zip(formulas_title, positions_title):
                Formulas["FormulaId"].append(starting_formula_index + formula_index)
                Formulas["Site"].append(site_name)
                Formulas["PostId"].append(int(question))
                Formulas["LaTeXBody"].append(formula)
                Formulas["TokenLength"].append(formula_token_length(formula))
                # position -1 for formulas in the title
                Formulas["StartingPosition"].append(-1)
                Formulas["Inline"].append(True)
                formula_index += 1
            for formula, position, inl in zip(formulas_body, positions_body, inline):
                Formulas["FormulaId"].append(starting_formula_index + formula_index)
                Formulas["Site"].append(site_name)
                Formulas["PostId"].append(int(question))
                Formulas["LaTeXBody"].append(formula)
                Formulas["TokenLength"].append(formula_token_length(formula))
                Formulas["StartingPosition"].append(position)
                Formulas["Inline"].append(inl)
                formula_index += 1
        else:
            error_count += 1
        if len(Formulas["FormulaId"]) > 1000000:
            # flush a full buffer to the database to limit memory usage
            df = pd.DataFrame(Formulas)
            write_table(database, 'FormulasPosts', df)
            Formulas = {"FormulaId": [], "Site": [], "PostId": [], "LaTeXBody": [],
                        "TokenLength": [], "StartingPosition": [], "Inline": []}
            del df
    df = pd.DataFrame(Formulas)
    write_table(database, 'FormulasPosts', df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, str(formula_index) + " formulas parsed from questions")
    log(statistics_file, str(error_count) + " errors in parsing question formulas")
    log(statistics_file,
        "error rate parsing formulas from questions: " +
        format(error_count / len(questions["QuestionId"]) * 100, ".4f") + " %")
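
# current_formula_id is assumed to return the next unused FormulaId so that ids
# keep increasing across the FormulasPosts and FormulasComments tables and
# across sites. A minimal sketch under that assumption; the actual helper may
# be implemented differently.
def current_formula_id_sketch(database):
    """Illustrative sketch: next free FormulaId over both formula tables."""
    DB = sqlite3.connect(database)
    cursor = DB.cursor()
    next_id = 0
    for table in ("FormulasPosts", "FormulasComments"):
        cursor.execute(
            "SELECT count(name) FROM sqlite_master WHERE type='table' AND name=?",
            (table,))
        if cursor.fetchone()[0] == 1:
            cursor.execute('SELECT max(FormulaId) FROM "' + table + '"')
            max_id = cursor.fetchone()[0]
            if max_id is not None:
                next_id = max(next_id, max_id + 1)
    DB.close()
    return next_id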
def context_main(filename_dumps, dump_directory, database, x, n, corpus,
                 tablename, tfidf, all, stopwords):
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    start = time.time()
    with open(filename_dumps) as f:
        sites = [line.rstrip() for line in f if line != ""]
    log(statistics_file, "#################################################")
    log(statistics_file, "context.py")
    log(statistics_file, "input: " + dump_directory + ", x - " + str(x) + ", n - " +
        str(n) + ", corpus - " + corpus + ", tfidf - " + tfidf + ", all - " + all)
    log(statistics_file, "output: " + database + " - Table " + tablename + ", " + statistics_file)
    log(statistics_file, "dumps: " + str(sites))
    log(statistics_file, "-------------------------")
    downloader = DumpDownloader()
    directories = [
        os.path.join(dump_directory, downloader.get_file_name(site)).replace(".7z", "/")
        for site in sites
    ]
    all = (all == 'yes')
    tfidf = (tfidf == 'yes')
    if stopwords == 'none':
        stopwords = None
    elif stopwords == 'english':
        stopwords = 'english'
    else:
        try:
            with open(stopwords) as f:
                stopwords = [line.rstrip() for line in f if line != ""]
            print("Stopwords: " + str(stopwords))
        except Exception:
            print("File " + stopwords + " not found or error while parsing. Using no stopwords")
            stopwords = None
    if not all:
        if corpus not in ("all", "individual"):
            sys.exit("option for --corpus must be 'all' or 'individual'")
        if corpus == "all":
            print("Calculating idf values of all sites' texts")
            t1 = time.time()
            bow = calculate_idf(sites, directories, database, stopwords)
            log(statistics_file, "time calculating idf scores: " +
                str(int((time.time() - t1) / 60)) + "min " +
                str(int((time.time() - t1) % 60)) + "sec")
    if_exists = "replace"
    for site, directory in zip(sites, directories):
        if not os.path.exists(directory):
            print(directory + " not found")
        if (corpus == "individual") and not all:
            print("Calculating idf values of texts of site " + site)
            t1 = time.time()
            bow = calculate_idf([site], directories, database, stopwords)
            log(statistics_file, "time calculating idf scores: " +
                str(int((time.time() - t1) / 60)) + "min " +
                str(int((time.time() - t1) % 60)) + "sec")
        # for each formula: get its context, calculate the top n context terms
        # and save them to a table in the database (table name is configurable)
        contexts, docs = posts_context(directory, database, site, x, all)
        t1 = time.time()
        if all:
            top_n_contexts = {}
            for postid in contexts:
                for id, context in contexts[postid].items():
                    top_n_contexts[id] = " ".join(context)
        else:
            top_n_contexts = bow.get_top_n_tfidf2(contexts, docs, n, tfidf, all)
        log(statistics_file, "time for contexts posts: " +
            str(int((time.time() - t1) / 60)) + "min " +
            str(int((time.time() - t1) % 60)) + "sec")
        write_context_table(site, top_n_contexts, database, tablename, if_exists)
        if_exists = "append"
        contexts, docs = comments_context(directory, database, site, x, all)
        t1 = time.time()
        if all:
            top_n_contexts = {}
            for commentid in contexts:
                for id, context in contexts[commentid].items():
                    top_n_contexts[id] = " ".join(context)
        else:
            top_n_contexts = bow.get_top_n_tfidf2(contexts, docs, n, tfidf, all)
        log(statistics_file, "time for contexts comments: " +
            str(int((time.time() - t1) / 60)) + "min " +
            str(int((time.time() - t1) % 60)) + "sec")
        write_context_table(site, top_n_contexts, database, tablename, if_exists)
    log(statistics_file, "-------------------------")
    log(statistics_file, "total execution time: " +
        str(int((time.time() - start) / 60)) + "min " +
        str(int((time.time() - start) % 60)) + "sec")
    log(statistics_file, "max memory usage: " +
        format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / pow(2, 30), ".3f") +
        " GigaByte")
    log(statistics_file, "#################################################")
def main(filename_dumps, database, directory):
    statistics_file = os.path.join(directory, "statistics.log")
    with open(filename_dumps) as f:
        sites = [line.rstrip() for line in f if line != ""]
    start = time.time()
    log(statistics_file, "#################################################")
    log(statistics_file, "statistics.py")
    log(statistics_file, "input: " + database)
    log(statistics_file, "output: " + directory + ", " + statistics_file)
    log(statistics_file, "dumps: " + str(sites))
    log(statistics_file, "-------------------------")
    # set plot style (seaborn)
    sns.set_theme(context="paper")
    for site in sites:
        DB = sqlite3.connect(database)
        formulas_posts = pd.read_sql(
            'select FormulaId, PostId, TokenLength from "FormulasPosts" where Site="' + site + '"', DB)
        question_ids = pd.read_sql(
            'select QuestionId from "QuestionTags" where Site="' + site + '"', DB)
        answer_ids = pd.read_sql(
            'select AnswerId from "AnswerMeta" where Site="' + site + '"', DB)
        DB.close()
        post_ids = list(question_ids["QuestionId"]) + list(answer_ids["AnswerId"])
        question_ids.pop("QuestionId")
        answer_ids.pop("AnswerId")
        figure_file_p, df_stats_p = formulas_per_post(
            dict(zip(formulas_posts["FormulaId"], formulas_posts["PostId"])),
            post_ids, list(formulas_posts["TokenLength"]), site, directory, "post")
        formulas_posts.pop("FormulaId")
        formulas_posts.pop("PostId")
        DB = sqlite3.connect(database)
        question_texts = pd.read_sql(
            'select Title, Body from "QuestionText" where Site="' + site + '"', DB)
        answer_texts = pd.read_sql(
            'select Body from "AnswerText" where Site="' + site + '"', DB)
        DB.close()
        df_words = common_words(
            list(question_texts["Title"]) + list(question_texts["Body"]) +
            list(answer_texts["Body"]), 100)
        question_texts.pop("Title")
        question_texts.pop("Body")
        answer_texts.pop("Body")
        DB = sqlite3.connect(database)
        formulas_comments = pd.read_sql(
            'select FormulaId, CommentId, TokenLength from "FormulasComments" where Site="' + site + '"', DB)
        comment_ids = pd.read_sql(
            'select CommentId from "Comments" where Site="' + site + '"', DB)
        DB.close()
        figure_file_c, df_stats_c = formulas_per_post(
            dict(zip(formulas_comments["FormulaId"], formulas_comments["CommentId"])),
            list(comment_ids["CommentId"]), list(formulas_comments["TokenLength"]),
            site, directory, "comment")
        comment_ids.pop("CommentId")
        formulas_comments.pop("FormulaId")
        formulas_comments.pop("CommentId")
        DB = sqlite3.connect(database)
        formulas_posts = pd.read_sql(
            'select LaTeXBody from "FormulasPosts" where Site="' + site + '"', DB)
        formulas_comments = pd.read_sql(
            'select LaTeXBody from "FormulasComments" where Site="' + site + '"', DB)
        DB.close()
        all_formulas = list(formulas_posts["LaTeXBody"]) + list(formulas_comments["LaTeXBody"])
        formulas_posts.pop("LaTeXBody")
        formulas_comments.pop("LaTeXBody")
        df_tokens = common_tokens(all_tokens(all_formulas), 100)
        all_formulas = []
        DB = sqlite3.connect(database)
        formulas_posts = pd.read_sql(
            'select LaTeXBody from "FormulasPosts" where Site="' + site + '" and TokenLength>"1"', DB)
        formulas_comments = pd.read_sql(
            'select LaTeXBody from "FormulasComments" where Site="' + site + '" and TokenLength>"1"', DB)
        DB.close()
        all_formulas = list(formulas_posts["LaTeXBody"]) + list(formulas_comments["LaTeXBody"])
        formulas_posts.pop("LaTeXBody")
        formulas_comments.pop("LaTeXBody")
        df_duplicates = duplicate_formulas(all_formulas, 100)
        print("max memory usage: " +
              format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / pow(2, 30), ".3f") +
              " GigaByte")
        DB = sqlite3.connect(database)
        tags = pd.read_sql(
            'select Tag, Count from "Tags" where Site="' + site + '" ORDER BY Count DESC limit 100', DB)
        DB.close()
        tags_histo_all(
            tags, "100 Most Frequent Tags Distribution in '" + site + "'",
            os.path.join(directory, "diagrams", site + "_tags_desc.png"))
        DB = sqlite3.connect(database)
        top_tags = pd.read_sql(
            'select Tag, Count from "Tags" where Site="' + site + '" ORDER BY Count DESC limit 20', DB)
        DB.close()
        tags_histo(top_tags, "20 Most Frequent Tags in '" + site + "'",
                   os.path.join(directory, "diagrams", site + "_top_tags.png"))
        DB = sqlite3.connect(database)
        bottom_tags = pd.read_sql(
            'select Tag, Count from "Tags" where Site="' + site + '" AND Count > 3 ORDER BY Count ASC limit 20', DB)
        DB.close()
        tags_histo(
            bottom_tags, "20 Least Frequent Tags in '" + site + "'",
            os.path.join(directory, "diagrams", site + "_bottom_tags.png"))
        save_to_html(figure_file_p, figure_file_c, df_tokens, df_words, df_stats_p,
                     df_stats_c, df_duplicates, directory, site)
    log(statistics_file, "-------------------------")
    log(statistics_file, "total execution time: " +
        str(int((time.time() - start) / 60)) + "min " +
        str(int((time.time() - start) % 60)) + "sec")
    log(statistics_file, "max memory usage: " +
        format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / pow(2, 30), ".3f") +
        " GigaByte")
    log(statistics_file, "#################################################")
def formulas_per_post(formulaid_postid, all_postids, token_lengths, site, directory, text_type):
    posts = len(all_postids)
    formulas = len(formulaid_postid)
    stats_titles = []
    stats_values = []
    stats_titles.append("total " + text_type + "s: ")
    stats_values.append(format(posts, ',d'))
    stats_titles.append("total formulas in " + text_type + "s: ")
    stats_values.append(format(formulas, ',d'))
    stats_titles.append("average number of formulas per " + text_type + ": ")
    stats_values.append(format(formulas / posts, ".2f"))
    # percentage of posts with at least 1 formula
    unique_postids = len(set(formulaid_postid.values()))
    stats_titles.append("total number of " + text_type + "s containing formulas: ")
    stats_values.append(format(unique_postids, ',d'))
    stats_titles.append("percentage of " + text_type + "s containing formulas: ")
    stats_values.append(format(100 * unique_postids / posts, ".2f"))
    # number of formulas per post {postid: #formulas}
    counter = Counter(formulaid_postid.values())
    # number of posts with x formulas {x: #posts}
    counts_counter = Counter(counter.values())
    # manually add the number of posts with 0 formulas
    counts_counter[0] = posts - unique_postids
    rev_ordered_counts_counter = collections.OrderedDict(
        sorted(counts_counter.items(), reverse=True))
    # collapse the long tail (up to 2% of the posts) into a single ">=" bucket
    to_remove = 0.02 * posts
    removed = 0
    top = []
    for k, v in rev_ordered_counts_counter.items():
        if k < 20:
            break
        if removed <= to_remove:
            top.append(counts_counter.pop(k))
            removed += v
        else:
            break
    # fill gaps so every formula count up to the maximum appears on the x-axis
    prev = 0
    for k in sorted(counts_counter.keys()):
        while k > prev + 1:
            prev += 1
            counts_counter[prev] = 0
        prev = k
    fig, (ax1, ax2) = plt.subplots(2, 1)
    # first panel: histogram of the number of formulas per post/comment
    o_counts_counter = OrderedDict(sorted(counts_counter.items()))
    labels = [str(k) for k in o_counts_counter.keys()] + ["x"]
    o_counts_counter[len(o_counts_counter)] = removed
    ax1.bar(labels, o_counts_counter.values(), color='darkgreen',
            edgecolor='black', linewidth=1)
    ax1.set_title("Formula Distribution of '" + site + "' in " + text_type.title() + "s")
    ax1.set_xlabel("Number of Formulas per " + text_type.title())
    ax1.set_ylabel("Number of " + text_type.title() + "s")
    plt.sca(ax1)
    labels.remove("x")
    maximum = labels[len(labels) - 1]
    if int(maximum) < 20:
        locations = reduce_labels(labels)
        labels = locations
    else:
        locations = reduce_labels(labels)
        if (locations[-1] == labels[-1]) & (len(locations) > 5):
            locations.remove(labels[-1])
            labels.remove(labels[-1])
        locations.append("x")
        labels = reduce_labels(labels)
        labels.append(r'$\geq$' + str(int(maximum) + 1))
    plt.xticks(locations, labels)
    plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
    # second panel: distribution of formula lengths (tokens per formula)
    counter = Counter(sorted(token_lengths))
    counter[0] = 0
    removed = 0
    rev_ordered_counter = collections.OrderedDict(sorted(counter.items(), reverse=True))
    to_remove = 0.05 * len(token_lengths)
    top = []
    for k, v in rev_ordered_counter.items():
        if k < 20:
            break
        if removed <= to_remove:
            top.append(counter.pop(k))
            removed += v
        else:
            break
    prev = 0
    for k in sorted(counter.keys()):
        while k > prev + 1:
            prev += 1
            counter[prev] = 0
        prev = k
    ordered_counter = collections.OrderedDict(sorted(counter.items()))
    labels = [str(k) for k in ordered_counter.keys()] + ["x"]
    ordered_counter[max(ordered_counter.keys()) + 1] = removed
    ax2.bar(labels, ordered_counter.values(), color='darkgreen',
            edgecolor='black', linewidth=1)
    ax2.set_title("Formula Length Distribution of " + text_type.title() + "s in '" + site + "'")
    ax2.set_xlabel("Number of Tokens per Formula")
    ax2.set_ylabel("Number of Formulas")
    plt.sca(ax2)
    labels.remove("x")
    maximum = labels[len(labels) - 1]
    locations = reduce_labels(labels)
    if (locations[-1] == labels[-1]) & (len(locations) > 5):
        locations.remove(labels[-1])
        labels.remove(labels[-1])
    locations.append("x")
    labels = reduce_labels(labels)
    labels.append(r'$\geq$' + str(int(maximum) + 1))
    plt.xticks(locations, labels)
    plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
    fig.tight_layout()
    file = os.path.join(directory, "diagrams", site + "_" + text_type + "_stats.png")
    fig.savefig(file, dpi=400)
    log("../output/statistics.log", "Figure saved to " + file)
    return os.path.join("diagrams", site + "_" + text_type + "_stats.png"), pd.DataFrame({
        "Title": stats_titles,
        "Value": stats_values
    })
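
# reduce_labels is assumed to thin out categorical x-tick labels so that long
# ranges stay readable; a hypothetical sketch that keeps roughly every n-th
# label (the project's helper may use a different heuristic):
def reduce_labels_sketch(labels, max_ticks=10):
    """Illustrative sketch: return an evenly thinned subset of tick labels."""
    step = max(1, len(labels) // max_ticks)
    return labels[::step]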
def main(dump_directory, filename_dumps, download, extract, database, force_process):
    if not os.path.isdir(os.path.dirname(database)):
        os.mkdir(os.path.dirname(database))
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    start = time.time()
    log(statistics_file, "#################################################")
    log(statistics_file, "main.py")
    log(statistics_file, "input: " + dump_directory)
    log(statistics_file, "output: " + database + ", " + statistics_file)
    log(statistics_file, "dumps: " + filename_dumps)
    log(statistics_file, "-------------------------")
    sites, directories, files = dumps(dump_directory, filename_dumps, download, extract)
    dump_processing.database.create_tables(database)
    DB = sqlite3.connect(database)
    sites_hashs = pd.read_sql('select * from "SiteFileHash"', DB)
    DB.close()
    bag_of_words = BOW()
    first = True
    for site, dir, file in zip(sites, directories, files):
        log(statistics_file, "Processing site " + site)
        if extract == "yes":
            # hash the downloaded archive to detect whether the dump has changed
            with open(file, 'rb') as f:
                hasher = hashlib.md5()
                for chunk in iter(lambda: f.read(128 * hasher.block_size), b''):
                    hasher.update(chunk)
            hash = hasher.hexdigest()
            exists = sites_hashs[sites_hashs["Site"] == site].any()[0]
            if exists:
                old_hash = sites_hashs["MD5Hash"][
                    sites_hashs[sites_hashs["Site"] == site].index.values[0]]
            else:
                old_hash = ""
            if (hash != old_hash) | (force_process == "yes"):
                dump_processing.database.remove_site(site, database)
                dump_processing.process_dump.processing_main(site, dir, database, 7)
                save_hash(database, site, hash, exists)
        else:
            dump_processing.database.remove_site(site, database)
            dump_processing.process_dump.processing_main(site, dir, database, 7)
    # TODO: highlighted, bold etc. words
    log(statistics_file, "-------------------------")
    log(statistics_file, "total execution time: " +
        str(int((time.time() - start) / 60)) + "min " +
        str(int((time.time() - start) % 60)) + "sec")
    log(statistics_file, "max memory usage: " +
        format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / pow(2, 30), ".3f") +
        " GigaByte")
    log(statistics_file, "#################################################")
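
# save_hash is assumed to insert or update the archive hash in the SiteFileHash
# table (columns Site and MD5Hash, as read above) so an unchanged dump is not
# reprocessed on the next run. A minimal sketch under that assumption:
def save_hash_sketch(database, site, hash, exists):
    """Illustrative sketch: persist the MD5 hash of a site's dump archive."""
    DB = sqlite3.connect(database)
    cursor = DB.cursor()
    if exists:
        cursor.execute('UPDATE "SiteFileHash" SET MD5Hash = ? WHERE Site = ?',
                       (hash, site))
    else:
        cursor.execute('INSERT INTO "SiteFileHash" (Site, MD5Hash) VALUES (?, ?)',
                       (site, hash))
    DB.commit()
    DB.close()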
def main(filename_dumps, database, mode, threads, tree, comments, offset,
         total_formulas, output_database):
    statistics_file = os.path.join(Path(output_database).parent, "parse_formulas.log")
    if not os.path.exists(statistics_file):
        with open(statistics_file, 'w'):
            pass
    start = time.time()
    log(statistics_file, "#################################################")
    log(statistics_file, "parse_formulas.py")
    log(statistics_file, "input: " + database + ", mode: " + mode + ", " + threads + " threads")
    log(statistics_file, "output: " + output_database + ", " + statistics_file)
    with open(filename_dumps) as f:
        sites = [line.rstrip() for line in f if line != ""]
    log(statistics_file, "dumps: " + str(sites))
    log(statistics_file, "-------------------------")
    try:
        threads = int(threads)
    except ValueError:
        print("An error occurred parsing the --threads argument " + threads)
    create_mathml_tables(output_database)
    tree = (tree == "yes")
    comments = (comments == "yes")
    for site in sites:
        # per-site timer; do not overwrite the overall start time
        site_start = time.time()
        if mode == "cmml":
            formulas_to_cmml(database, "FormulasPosts", site, threads, tree,
                             offset, total_formulas, output_database)
            sys.stdout.write('\n')
            if comments:
                formulas_to_cmml(database, "FormulasComments", site, threads, tree,
                                 offset, total_formulas, output_database)
        if mode == "pmml":
            formulas_to_pmml(database, "FormulasPosts", site, threads, tree,
                             offset, total_formulas, output_database)
            sys.stdout.write('\n')
            if comments:
                formulas_to_pmml(database, "FormulasComments", site, threads, tree,
                                 offset, total_formulas, output_database)
        if mode == "both":
            formulas_to_both_ml(database, "FormulasPosts", site, threads, tree,
                                offset, total_formulas, output_database)
            sys.stdout.write('\n')
            if comments:
                formulas_to_both_ml(database, "FormulasComments", site, threads, tree,
                                    offset, total_formulas, output_database)
        sys.stdout.write('\n' + site + ' finished. Time: ' +
                         str(int((time.time() - site_start) / 60)) + "min " +
                         str(int((time.time() - site_start) % 60)) + "sec")
    log(statistics_file, "\n-------------------------")
    log(statistics_file, "total execution time: " +
        str(int((time.time() - start) / 60)) + "min " +
        str(int((time.time() - start) % 60)) + "sec")
    log(statistics_file, "max memory usage: " +
        format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / pow(2, 30), ".3f") +
        " GigaByte")
    log(statistics_file, "#################################################")
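
# create_mathml_tables is assumed to create the FormulasPostsMathML and
# FormulasCommentsMathML tables (the ones remove_site checks for above) if
# they do not exist yet. The column layout below is a guess for illustration
# only and is not taken from the project's schema:
def create_mathml_tables_sketch(output_database):
    """Illustrative sketch: ensure the MathML result tables exist."""
    DB = sqlite3.connect(output_database)
    cursor = DB.cursor()
    for table in ("FormulasPostsMathML", "FormulasCommentsMathML"):
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS "' + table + '" ('
            'FormulaId INTEGER, Site TEXT, MathML TEXT)')
    DB.commit()
    DB.close()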
def comments_processing(site_name, directory, database):
    comments = {
        "Site": [], "CommentId": [], "PostId": [], "UserId": [],
        "Score": [], "Text": [], "CreationDate": []
    }
    comment_index = 0
    for event, elem in ET.iterparse(os.path.join(directory, "Comments.xml")):
        if event == "end":
            try:
                postid = int(elem.attrib["PostId"])
                userid = int(elem.attrib["UserId"])
                score = int(elem.attrib["Score"])
                creationdate = elem.attrib["CreationDate"]
                text = elem.attrib["Text"]
                comments["Site"].append(site_name)
                comments["CommentId"].append(comment_index)
                comments["PostId"].append(postid)
                comments["UserId"].append(userid)
                comments["Score"].append(score)
                comments["CreationDate"].append(creationdate)
                comments["Text"].append(text)
                elem.clear()
                comment_index += 1
            except Exception:
                pass
            if len(comments["CommentId"]) > 1000000:
                # flush a full buffer to the database to limit memory usage
                df = pd.DataFrame(comments)
                write_table(database, 'Comments', df)
                comments = {
                    "Site": [], "CommentId": [], "PostId": [], "UserId": [],
                    "Score": [], "Text": [], "CreationDate": []
                }
    df = pd.DataFrame(comments)
    write_table(database, 'Comments', df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    # use the running index so the count also covers previously flushed buffers
    log(statistics_file, "# comments: " + str(comment_index))