def badge_processing(site_name, directory, database):
    """Parse Badges.xml and write one row per awarded badge to the Badges table."""
    site = []
    index = []
    UserId = []
    BadgeName = []
    Class = []
    BadgeDate = []
    for event, elem in ET.iterparse(os.path.join(directory, "Badges.xml")):
        if event == "end":
            try:
                ind = int(elem.attrib["Id"])
                userid = int(elem.attrib["UserId"])
                badgename = elem.attrib["Name"]
                badgeclass = elem.attrib["Class"]
                badgedate = elem.attrib["Date"]
                site.append(site_name)
                index.append(ind)
                UserId.append(userid)
                BadgeName.append(badgename)
                Class.append(badgeclass)
                BadgeDate.append(badgedate)
                elem.clear()
            except Exception:
                # skip rows with missing or malformed attributes
                pass
    df = pd.DataFrame({"Site": site, "BadgeId": index, "UserId": UserId,
                       "BadgeName": BadgeName, "BadgeClass": Class, "BadgeDate": BadgeDate})
    write_table(database, "Badges", df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    # note: len(df) counts badge rows (one per awarded badge), not distinct users
    log(statistics_file, "# users having badges: %d" % len(df))
def process_question_tags(site_name, questions, database):
    """Write the question-to-tags mapping to QuestionTags and drop Tags from the questions dict."""
    sites = [site_name for i in range(len(questions["QuestionId"]))]
    df = pd.DataFrame({
        "Site": sites,
        "QuestionId": questions["QuestionId"],
        "Tags": questions["Tags"]
    })
    write_table(database, "QuestionTags", df)
    questions.pop("Tags")
def process_answer_body(site_name, answers, database, directory):
    sites = [site_name for i in range(len(answers["AnswerId"]))]
    df = pd.DataFrame({
        "Site": sites,
        "AnswerId": answers["AnswerId"],
        "Body": answers["Body"]
    })
    write_table(database, "AnswerText", df)
    answers.pop("Body")
def process_question_text(site_name, questions, database, directory):
    sites = [site_name for i in range(len(questions["QuestionId"]))]
    df = pd.DataFrame({
        "Site": sites,
        "QuestionId": questions["QuestionId"],
        "Title": questions["Title"],
        "Body": questions["Body"]
    })
    write_table(database, "QuestionText", df)
    questions.pop("Title")
    questions.pop("Body")
def process_answer_meta(site_name, answers, database):
    sites = [site_name for i in range(len(answers["AnswerId"]))]
    df = pd.DataFrame({
        "Site": sites,
        "AnswerId": answers["AnswerId"],
        "QuestionId": answers["QuestionId"],
        "CreationDate": answers["CreationDate"],
        "Score": answers["Score"],
        #"CommentCount": answers["CommentCount"],
        #"LastEditorUserId": answers["LastEditorUserId"],
        "OwnerUserId": answers["OwnerUserId"]
    })
    write_table(database, "AnswerMeta", df)
def duplicate_questions(database, df):
    """Split the post-link table into duplicate-question and related-question pairs."""
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    result = df.groupby("LinkTypeId")
    for k, v in result:
        d = pd.DataFrame({
            "QuestionId": v["PostId"],
            "RelatedQuestionId": v["RelatedPostId"]
        })
        if k == 3:
            # LinkTypeId 3 marks duplicate links
            log(statistics_file, "# duplicate questions: %d" % len(d))
            file_name = "DuplicateQuestions"
        elif k == 1:
            # LinkTypeId 1 marks "related" links
            log(statistics_file, "# related questions: %d" % len(d))
            file_name = "RelatedQuestionsSource2Target"
        else:
            # ignore any other link types
            continue
        write_table(database, file_name, d)
def tags_processing(site_name, directory, database):
    d = {"Site": [], "Tag": [], "Count": []}
    for event, elem in ET.iterparse(os.path.join(directory, "Tags.xml")):
        if event == "end":
            try:
                tag = elem.attrib["TagName"]
                count = int(elem.attrib["Count"])
                d["Site"].append(site_name)
                d["Tag"].append(tag)
                d["Count"].append(count)
                elem.clear()
            except Exception:
                pass
    df = pd.DataFrame(d)
    write_table(database, "Tags", df)
def postlinks_processing(site_name, directory, database):
    d = {"Site": [], "PostId": [], "RelatedPostId": [], "LinkTypeId": []}
    for event, elem in ET.iterparse(os.path.join(directory, "PostLinks.xml")):
        if event == "end":
            try:
                postid = int(elem.attrib["PostId"])
                relatedpostid = int(elem.attrib["RelatedPostId"])
                linktypeid = int(elem.attrib["LinkTypeId"])
                d["Site"].append(site_name)
                d["PostId"].append(postid)
                d["RelatedPostId"].append(relatedpostid)
                d["LinkTypeId"].append(linktypeid)
                elem.clear()
            except Exception:
                pass
    df = pd.DataFrame(d)
    write_table(database, "PostIdRelatedPostId", df)
def comments_formula_processing(site_name, database, directory, context_length):
    """Extract LaTeX formulas from comment texts and write them to FormulasComments."""
    DB = sqlite3.connect(database)
    comments = pd.read_sql('select CommentId, Text from "Comments" where Site = ?', DB, params=(site_name,))
    DB.close()
    Formulas = {"FormulaId": [], "Site": [], "CommentId": [], "LaTeXBody": [],
                "TokenLength": [], "StartingPosition": [], "Inline": []}
    error_count = 0
    starting_formula_index = current_formula_id(database)
    formula_index = 0
    for comment, body in zip(comments["CommentId"], comments["Text"]):
        # sites with non-standard math delimiters need the special extractor
        if site_name not in DumpDownloader.special_delim:
            formulas, positions, inline, error = formula_extr(body, site_name)
        else:
            formulas, positions, inline, error = formula_extr_special(body, DumpDownloader.special_delim[site_name])
        if not error:
            for formula, position, inl in zip(formulas, positions, inline):
                Formulas["FormulaId"].append(starting_formula_index + formula_index)
                Formulas["Site"].append(site_name)
                Formulas["CommentId"].append(comment)
                Formulas["LaTeXBody"].append(formula)
                Formulas["TokenLength"].append(formula_token_length(formula))
                Formulas["StartingPosition"].append(position)
                Formulas["Inline"].append(inl)
                formula_index += 1
        else:
            error_count += 1
        # flush to the database every ~1M formulas to bound memory usage
        if len(Formulas["FormulaId"]) > 1000000:
            df = pd.DataFrame(Formulas)
            write_table(database, 'FormulasComments', df, "append")
            Formulas = {"FormulaId": [], "Site": [], "CommentId": [], "LaTeXBody": [],
                        "TokenLength": [], "StartingPosition": [], "Inline": []}
            del df
    df = pd.DataFrame(Formulas)
    write_table(database, 'FormulasComments', df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, str(formula_index) + " formulas parsed from comments")
    log(statistics_file, str(error_count) + " errors in parsing comment formulas")
    log(statistics_file, "error rate parsing formulas from comments: "
        + format(error_count / len(comments["CommentId"]) * 100, ".4f") + " %")
def process_question_acceptedanswer(site_name, questions, database):
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    sites = []
    questionId = []
    acceptedAnswerId = []
    # keep only questions that actually have an accepted answer
    for qid, aid in zip(questions["QuestionId"], questions["AcceptedAnswerId"]):
        if aid:
            sites.append(site_name)
            questionId.append(qid)
            acceptedAnswerId.append(aid)
    df = pd.DataFrame({
        "Site": sites,
        "QuestionId": questionId,
        "AcceptedAnswerId": acceptedAnswerId
    })
    write_table(database, "QuestionAcceptedAnswer", df)
    log(statistics_file, "# question-acceptedAnswer pairs: %d" % len(df))
def users_processing(site_name, directory, database):
    """Parse Users.xml and write one row per user to the Users table."""
    d = {"Site": [], "UserId": [], "Reputation": []}
    user_set = set()
    for event, elem in ET.iterparse(os.path.join(directory, "Users.xml")):
        if event == "end":
            try:
                # the network-wide AccountId is used as the user key; duplicates are skipped
                userid = int(elem.attrib["AccountId"])
                reputation = int(elem.attrib["Reputation"])
                if userid not in user_set:
                    d["Site"].append(site_name)
                    user_set.add(userid)
                    d["UserId"].append(userid)
                    d["Reputation"].append(reputation)
                elem.clear()
            except Exception:
                pass
    df = pd.DataFrame(d)
    write_table(database, "Users", df)
def questions_formula_processing(site_name, database, directory, context_length):
    """Extract LaTeX formulas from question titles and bodies and write them to FormulasPosts."""
    DB = sqlite3.connect(database)
    questions = pd.read_sql('select * from "QuestionText" where Site = ?', DB, params=(site_name,))
    DB.close()
    Formulas = {"FormulaId": [], "Site": [], "PostId": [], "LaTeXBody": [],
                "TokenLength": [], "StartingPosition": [], "Inline": []}
    error_count = 0
    starting_formula_index = current_formula_id(database)
    formula_index = 0
    # question processing (title and body)
    for question, title, body in zip(questions["QuestionId"], questions["Title"], questions["Body"]):
        if site_name not in DumpDownloader.special_delim:
            formulas_title, positions_title, _, error_title = formula_extr(title, site_name)
            formulas_body, positions_body, inline, error_body = formula_extr(body, site_name)
        else:
            formulas_title, positions_title, _, error_title = formula_extr_special(title, DumpDownloader.special_delim[site_name])
            formulas_body, positions_body, inline, error_body = formula_extr_special(body, DumpDownloader.special_delim[site_name])
        # parsing errors occur (~6500 in total); do not take formulas from "invalid" texts
        if not error_title and not error_body:
            for formula in formulas_title:
                Formulas["FormulaId"].append(starting_formula_index + formula_index)
                Formulas["Site"].append(site_name)
                Formulas["PostId"].append(int(question))
                Formulas["LaTeXBody"].append(formula)
                Formulas["TokenLength"].append(formula_token_length(formula))
                # position -1 marks formulas found in the title
                Formulas["StartingPosition"].append(-1)
                Formulas["Inline"].append(True)
                formula_index += 1
            for formula, position, inl in zip(formulas_body, positions_body, inline):
                Formulas["FormulaId"].append(starting_formula_index + formula_index)
                Formulas["Site"].append(site_name)
                Formulas["PostId"].append(int(question))
                Formulas["LaTeXBody"].append(formula)
                Formulas["TokenLength"].append(formula_token_length(formula))
                Formulas["StartingPosition"].append(position)
                Formulas["Inline"].append(inl)
                formula_index += 1
        else:
            error_count += 1
        # flush to the database every ~1M formulas to bound memory usage
        if len(Formulas["FormulaId"]) > 1000000:
            df = pd.DataFrame(Formulas)
            write_table(database, 'FormulasPosts', df)
            Formulas = {"FormulaId": [], "Site": [], "PostId": [], "LaTeXBody": [],
                        "TokenLength": [], "StartingPosition": [], "Inline": []}
            del df
    df = pd.DataFrame(Formulas)
    write_table(database, 'FormulasPosts', df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, str(formula_index) + " formulas parsed from questions")
    log(statistics_file, str(error_count) + " errors in parsing question formulas")
    log(statistics_file, "error rate parsing formulas from questions: "
        + format(error_count / len(questions["QuestionId"]) * 100, ".4f") + " %")
def comments_processing(site_name, directory, database):
    """Parse Comments.xml and write comment metadata and text to the Comments table."""
    comments = {"Site": [], "CommentId": [], "PostId": [], "UserId": [],
                "Score": [], "Text": [], "CreationDate": []}
    comment_index = 0
    for event, elem in ET.iterparse(os.path.join(directory, "Comments.xml")):
        if event == "end":
            try:
                postid = int(elem.attrib["PostId"])
                userid = int(elem.attrib["UserId"])
                score = int(elem.attrib["Score"])
                creationdate = elem.attrib["CreationDate"]
                text = elem.attrib["Text"]
                comments["Site"].append(site_name)
                comments["CommentId"].append(comment_index)
                comments["PostId"].append(postid)
                comments["UserId"].append(userid)
                comments["Score"].append(score)
                comments["CreationDate"].append(creationdate)
                comments["Text"].append(text)
                elem.clear()
                comment_index += 1
            except Exception:
                # skip rows with missing or malformed attributes
                pass
            # flush to the database every ~1M comments to bound memory usage
            if len(comments["CommentId"]) > 1000000:
                df = pd.DataFrame(comments)
                write_table(database, 'Comments', df)
                comments = {"Site": [], "CommentId": [], "PostId": [], "UserId": [],
                            "Score": [], "Text": [], "CreationDate": []}
    df = pd.DataFrame(comments)
    write_table(database, 'Comments', df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    # comment_index counts all successfully parsed comments, including flushed chunks
    log(statistics_file, "# comments: " + str(comment_index))