import os
import sqlite3
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd

# Helper functions (write_table, log, current_formula_id, formula_extr,
# formula_extr_special, formula_token_length) and the DumpDownloader class
# are assumed to be defined elsewhere in this project.

def badge_processing(site_name, directory, database):
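    """Parse Badges.xml and write one row per awarded badge to the Badges table."""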
    site = []
    index = []
    UserId = []
    BadgeName = []
    Class = []
    BadgeDate = []

    for event,elem in ET.iterparse(os.path.join(directory, "Badges.xml")):
        if event == "end":
            try:
                ind = int(elem.attrib["Id"])
                userid = int(elem.attrib["UserId"])
                badgename = elem.attrib["Name"]
                badgeclass = elem.attrib["Class"]
                badgedate = elem.attrib["Date"]
                site.append(site_name)
                index.append(ind)
                UserId.append(userid)
                BadgeName.append(badgename)
                Class.append(badgeclass)
                BadgeDate.append(badgedate)
                elem.clear()
            except (KeyError, ValueError):
                # Skip badges that lack the required attributes.
                pass

    df = pd.DataFrame({
        "Site": site,
        "BadgeId": index,
        "UserId": UserId,
        "BadgeName": BadgeName,
        "BadgeClass": Class,
        "BadgeDate": BadgeDate
    })
    write_table(database, "Badges", df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, "# badges awarded: %d" % len(df))

def process_question_tags(site_name, questions, database):
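    """Write the question-to-tags mapping to the QuestionTags table, then drop Tags from the questions dict."""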
    sites = [site_name] * len(questions["QuestionId"])
    df = pd.DataFrame({
        "Site": sites,
        "QuestionId": questions["QuestionId"],
        "Tags": questions["Tags"]
    })
    write_table(database, "QuestionTags", df)
    questions.pop("Tags")
def process_answer_body(site_name, answers, database, directory):
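    """Write answer bodies to the AnswerText table, then drop Body from the answers dict."""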
    sites = [site_name] * len(answers["AnswerId"])
    df = pd.DataFrame({
        "Site": sites,
        "AnswerId": answers["AnswerId"],
        "Body": answers["Body"]
    })
    write_table(database, "AnswerText", df)

    answers.pop("Body")

def process_question_text(site_name, questions, database, directory):
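    """Write question titles and bodies to the QuestionText table, then drop them from the questions dict."""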
    sites = [site_name] * len(questions["QuestionId"])
    df = pd.DataFrame({
        "Site": sites,
        "QuestionId": questions["QuestionId"],
        "Title": questions["Title"],
        "Body": questions["Body"]
    })
    write_table(database, "QuestionText", df)

    questions.pop("Title")
    questions.pop("Body")

def process_answer_meta(site_name, answers, database):
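    """Write answer metadata (question id, creation date, score, owner) to the AnswerMeta table."""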
    sites = [site_name] * len(answers["AnswerId"])
    df = pd.DataFrame({
        "Site": sites,
        "AnswerId": answers["AnswerId"],
        "QuestionId": answers["QuestionId"],
        "CreationDate": answers["CreationDate"],
        "Score": answers["Score"],
        #"CommentCount": answers["CommentCount"],
        #"LastEditorUserId": answers["LastEditorUserId"],
        "OwnerUserId": answers["OwnerUserId"]
    })
    write_table(database, "AnswerMeta", df)

def duplicate_questions(database, df):
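    """Split post links by LinkTypeId into the DuplicateQuestions (type 3) and RelatedQuestionsSource2Target (type 1) tables."""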
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    result = df.groupby("LinkTypeId")
    for k, v in result:
        d = pd.DataFrame({
            "QuestionId": v["PostId"],
            "RelatedQuestionId": v["RelatedPostId"]
        })
        if k == 3:
            log(statistics_file, "# duplicate questions: %d" % len(d))
            file_name = "DuplicateQuestions"
        elif k == 1:
            log(statistics_file, "# related questions: %d" % len(d))
            file_name = "RelatedQuestionsSource2Target"
        else:
            # Unknown link type: skip rather than writing under a stale table name.
            continue
        write_table(database, file_name, d)
def tags_processing(site_name, directory, database):
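    """Parse Tags.xml and write per-tag usage counts to the Tags table."""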
    d = {"Site": [], "Tag": [], "Count": []}
    for event, elem in ET.iterparse(os.path.join(directory, "Tags.xml")):
        if event == "end":
            try:
                tag = elem.attrib["TagName"]
                count = int(elem.attrib["Count"])
                d["Site"].append(site_name)
                d["Tag"].append(tag)
                d["Count"].append(count)
                elem.clear()
            except (KeyError, ValueError):
                # Skip tags that lack the required attributes.
                pass

    df = pd.DataFrame(d)
    write_table(database, "Tags", df)

def postlinks_processing(site_name, directory, database):
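    """Parse PostLinks.xml and write (PostId, RelatedPostId, LinkTypeId) rows to the PostIdRelatedPostId table."""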
    d = {"Site": [], "PostId": [], "RelatedPostId": [], "LinkTypeId": []}
    for event, elem in ET.iterparse(os.path.join(directory, "PostLinks.xml")):
        if event == "end":
            try:
                postid = int(elem.attrib["PostId"])
                relatedpostid = int(elem.attrib["RelatedPostId"])
                linktypeid = int(elem.attrib["LinkTypeId"])
                d["Site"].append(site_name)
                d["PostId"].append(postid)
                d["RelatedPostId"].append(relatedpostid)
                d["LinkTypeId"].append(linktypeid)
                elem.clear()
            except (KeyError, ValueError):
                # Skip post links that lack the required attributes.
                pass

    df = pd.DataFrame(d)
    write_table(database, "PostIdRelatedPostId", df)
def comments_formula_processing(site_name, database, directory, context_length):
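    """Extract LaTeX formulas from comment texts and write them, in chunks, to the FormulasComments table."""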
    DB = sqlite3.connect(database)
    comments = pd.read_sql('select CommentId, Text from "Comments" where Site = ?', DB, params=(site_name,))
    DB.close()

    Formulas = {"FormulaId": [], "Site": [], "CommentId": [], "LaTeXBody":[], "TokenLength": [], "StartingPosition": [], "Inline": []}
    #formula_con = {}

    error_count = 0
    starting_formula_index = current_formula_id(database)
    formula_index = 0
    for comment, body in zip(comments["CommentId"], comments["Text"]):
        if site_name not in DumpDownloader.special_delim:
            formulas, positions, inline, error = formula_extr(body, site_name)
        else:
            formulas, positions, inline, error = formula_extr_special(body, DumpDownloader.special_delim[site_name])
        if not error:
            for formula, position, inl in zip(formulas, positions, inline):
                Formulas["FormulaId"].append(starting_formula_index+formula_index)
                Formulas["Site"].append(site_name)
                Formulas["CommentId"].append(comment)
                Formulas["LaTeXBody"].append(formula)
                #Formulas["SLTBody"].append(get_mathml(formula))
                Formulas["TokenLength"].append(formula_token_length(formula))
                Formulas["StartingPosition"].append(position)
                Formulas["Inline"].append(inl)
                #formula_con[starting_formula_index+formula_index] = [int(comment), formula, position, inl]
                formula_index += 1
        else:
            error_count += 1

        # Flush to the database in chunks to bound memory usage.
        if len(Formulas["FormulaId"]) > 1000000:
            df = pd.DataFrame(Formulas)
            write_table(database, 'FormulasComments', df, "append")
            Formulas = {"FormulaId": [], "Site": [], "CommentId": [], "LaTeXBody": [], "TokenLength": [], "StartingPosition": [], "Inline": []}

    df = pd.DataFrame({"FormulaId":Formulas["FormulaId"], "Site": Formulas["Site"], "CommentId":Formulas["CommentId"],"LaTeXBody":Formulas["LaTeXBody"], "TokenLength":Formulas["TokenLength"], "StartingPosition":Formulas["StartingPosition"], "Inline":Formulas["Inline"]})
    write_table(database, 'FormulasComments', df)

    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, str(formula_index) + " formulas parsed from comments")
    log(statistics_file, str(error_count) + " errors in parsing comment formulas")
    log(statistics_file, "error rate parsing formulas from comments: " + format(error_count/(len(comments["CommentId"]))*100, ".4f") + " %")
def process_question_acceptedanswer(site_name, questions, database):
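    """Write (QuestionId, AcceptedAnswerId) pairs to the QuestionAcceptedAnswer table."""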
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    sites = []
    questionId = []
    acceptedAnswerId = []
    for qid, aid in zip(questions["QuestionId"],
                        questions["AcceptedAnswerId"]):
        if aid:
            sites.append(site_name)
            questionId.append(qid)
            acceptedAnswerId.append(aid)
    df = pd.DataFrame({
        "Site": sites,
        "QuestionId": questionId,
        "AcceptedAnswerId": acceptedAnswerId
    })

    write_table(database, "QuestionAcceptedAnswer", df)
    log(statistics_file, "# question-acceptedAnswer pairs: %d" % len(df))

def users_processing(site_name, directory, database):
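    """Parse Users.xml and write one (AccountId, Reputation) row per unique account to the Users table."""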
    d = {"Site": [], "UserId": [], "Reputation": []}
    user_set = set()
    for event, elem in ET.iterparse(os.path.join(directory, "Users.xml")):
        if event == "end":
            try:
                userid = int(elem.attrib["AccountId"])
                reputation = int(elem.attrib["Reputation"])
                if userid not in user_set:
                    d["Site"].append(site_name)
                    user_set.add(userid)
                    d["UserId"].append(userid)
                    d["Reputation"].append(reputation)
                elem.clear()
            except (KeyError, ValueError):
                # Skip users that lack the required attributes.
                pass

    df = pd.DataFrame(d)
    write_table(database, "Users", df)
def questions_formula_processing(site_name, database, directory, context_length):
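    """Extract LaTeX formulas from question titles and bodies and write them, in chunks, to the FormulasPosts table."""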
    DB = sqlite3.connect(database)
    questions = pd.read_sql('select * from "QuestionText" where Site = ?', DB, params=(site_name,))
    DB.close()

    Formulas = {"FormulaId": [], "Site": [], "PostId": [], "LaTeXBody":[], "TokenLength": [], "StartingPosition": [], "Inline": []}
    #formula_con={}
    error_count = 0
    starting_formula_index = current_formula_id(database)
    formula_index = 0

    # question processing (title and body)
    for question, title, body in zip(questions["QuestionId"], questions["Title"], questions["Body"]):
        if site_name not in DumpDownloader.special_delim:
            formulas_title, positions_title, _, error_title = formula_extr(title, site_name)
            formulas_body, positions_body, inline, error_body = formula_extr(body, site_name)
        else:
            formulas_title, positions_title, _, error_title = formula_extr_special(title, DumpDownloader.special_delim[site_name])
            formulas_body, positions_body, inline, error_body = formula_extr_special(body, DumpDownloader.special_delim[site_name])

        # parsing errors occur (~6500 in total); do not take formulas from "invalid" texts
        if not error_title and not error_body:
            for formula in formulas_title:
                Formulas["FormulaId"].append(starting_formula_index+formula_index)
                Formulas["Site"].append(site_name)
                Formulas["PostId"].append(int(question))
                Formulas["LaTeXBody"].append(formula)
                #Formulas["SLTBody"].append(get_mathml(formula))
                Formulas["TokenLength"].append(formula_token_length(formula))
                # position -1 for formulas in title
                Formulas["StartingPosition"].append(-1)
                Formulas["Inline"].append(True)
                #formula_con[starting_formula_index+formula_index] = [int(question), formula, position, inl]
                formula_index += 1
            for formula, position, inl in zip(formulas_body, positions_body, inline):
                Formulas["FormulaId"].append(starting_formula_index+formula_index)
                Formulas["Site"].append(site_name)
                Formulas["PostId"].append(int(question))
                Formulas["LaTeXBody"].append(formula)
                #Formulas["SLTBody"].append(get_mathml(formula))
                Formulas["TokenLength"].append(formula_token_length(formula))
                Formulas["StartingPosition"].append(position)
                Formulas["Inline"].append(inl)
                #formula_con[starting_formula_index+formula_index] = [int(question), formula, position, inl]
                formula_index += 1
        else:
            error_count += 1

        # Flush to the database in chunks to bound memory usage.
        if len(Formulas["FormulaId"]) > 1000000:
            df = pd.DataFrame(Formulas)
            write_table(database, 'FormulasPosts', df, "append")
            Formulas = {"FormulaId": [], "Site": [], "PostId": [], "LaTeXBody": [], "TokenLength": [], "StartingPosition": [], "Inline": []}

    df = pd.DataFrame({"FormulaId":Formulas["FormulaId"], "Site": Formulas["Site"], "PostId":Formulas["PostId"],"LaTeXBody":Formulas["LaTeXBody"], "TokenLength":Formulas["TokenLength"], "StartingPosition":Formulas["StartingPosition"], "Inline":Formulas["Inline"]})
    write_table(database, 'FormulasPosts', df)

    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, str(formula_index) + " formulas parsed from questions")
    log(statistics_file, str(error_count) + " errors in parsing question formulas")
    log(statistics_file, "error rate parsing formulas from questions: " + format(error_count/(len(questions["QuestionId"]))*100, ".4f") + " %")
def comments_processing(site_name, directory, database):
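    """Parse Comments.xml and write comments, in chunks, to the Comments table."""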
    comments = {
        "Site": [],
        "CommentId": [],
        "PostId": [],
        "UserId": [],
        "Score": [],
        "Text": [],
        "CreationDate": []
    }
    comment_index = 0

    comments_dict = {}

    for event, elem in ET.iterparse(os.path.join(directory, "Comments.xml")):
        if event == "end":
            try:
                postid = int(elem.attrib["PostId"])
                userid = int(elem.attrib["UserId"])
                score = int(elem.attrib["Score"])
                creationdate = elem.attrib["CreationDate"]
                text = elem.attrib["Text"]

                comments["Site"].append(site_name)
                comments["CommentId"].append(comment_index)
                comments["PostId"].append(postid)
                comments["UserId"].append(userid)
                comments["Score"].append(score)
                comments["CreationDate"].append(creationdate)
                comments["Text"].append(text)
                elem.clear()

                #comments_dict[comment_index] = text

                comment_index += 1
            except (KeyError, ValueError):
                # Skip comments that lack the required attributes.
                pass
        # Flush to the database in chunks to bound memory usage.
        if len(comments["CommentId"]) > 1000000:
            df = pd.DataFrame(comments)
            write_table(database, 'Comments', df, "append")
            comments = {
                "Site": [],
                "CommentId": [],
                "PostId": [],
                "UserId": [],
                "Score": [],
                "Text": [],
                "CreationDate": []
            }

    df = pd.DataFrame(comments)
    write_table(database, 'Comments', df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, "# comments: " + str(len(df)))