def badge_processing(site_name, directory, database):
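    """Stream-parse Badges.xml of a site and write one row per badge to the Badges table."""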
    site = []
    index = []
    UserId = []
    BadgeName = []
    Class = []
    BadgeDate = []

    for event,elem in ET.iterparse(os.path.join(directory, "Badges.xml")):
        if event == "end":
            try:
                ind = int(elem.attrib["Id"])
                userid = int(elem.attrib["UserId"])
                badgename = elem.attrib["Name"]
                badgeclass = elem.attrib["Class"]
                badgedate = elem.attrib["Date"]
                site.append(site_name)
                index.append(ind)
                UserId.append(userid)
                BadgeName.append(badgename)
                Class.append(badgeclass)
                BadgeDate.append(badgedate)
                elem.clear()
            except Exception:
                # skip rows with missing or malformed attributes
                pass

    df = pd.DataFrame({"Site": site, "BadgeId": index, "UserId": UserId,
                       "BadgeName": BadgeName, "BadgeClass": Class, "BadgeDate": BadgeDate})
    write_table(database, "Badges", df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, "# badges: %d" % len(df))
# Example #2
def remove_site(site, database):
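    """Delete all rows belonging to `site` from the per-site tables (including the MathML tables, if present)."""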
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, "Removing old database entries of site " + site)

    tables = [
        "AnswerMeta", "AnswerText", "Badges", "Comments", "FormulasComments",
        "FormulasPosts", "PostIdRelatedPostId", "QuestionAcceptedAnswer",
        "QuestionTags", "QuestionText", "QuestionMeta", "Users", "Tags"
    ]
    DB = sqlite3.connect(database)
    cursor = DB.cursor()

    for table in tables:
        # table names come from the fixed lists above; bind the site value as a parameter
        cursor.execute('DELETE FROM "{}" WHERE site = ?'.format(table), (site,))

    tables = ["FormulasPostsMathML", "FormulasCommentsMathML"]

    for table in tables:
        cursor.execute(
            "SELECT count(name) FROM sqlite_master WHERE type='table' AND name=?",
            (table,))
        # if the count is 1, the table exists
        if cursor.fetchone()[0] == 1:
            cursor.execute('DELETE FROM "{}" WHERE site = ?'.format(table), (site,))

    DB.commit()
    DB.close()
def duplicate_questions(database, df):
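    """Group post links by LinkTypeId and write duplicate-question (type 3) and related-question (type 1) pairs to their tables."""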
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    result = df.groupby("LinkTypeId")
    for k, v in result:
        d = pd.DataFrame({
            "QuestionId": v["PostId"],
            "RelatedQuestionId": v["RelatedPostId"]
        })
        if k == 3:
            log(statistics_file, "# duplicate questions: %d" % len(d))
            file_name = "DuplicateQuestions"
        elif k == 1:
            log(statistics_file, "# related questions: %d" % len(d))
            file_name = "RelatedQuestionsSource2Target"
        else:
            # skip link types other than "related" (1) and "duplicate" (3)
            continue
        write_table(database, file_name, d)
def process_question_acceptedanswer(site_name, questions, database):
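    """Collect (QuestionId, AcceptedAnswerId) pairs for questions with an accepted answer and write them to QuestionAcceptedAnswer."""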
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    sites = []
    questionId = []
    acceptedAnswerId = []
    for qid, aid in zip(questions["QuestionId"],
                        questions["AcceptedAnswerId"]):
        if aid:
            sites.append(site_name)
            questionId.append(qid)
            acceptedAnswerId.append(aid)
    df = pd.DataFrame({
        "Site": sites,
        "QuestionId": questionId,
        "AcceptedAnswerId": acceptedAnswerId
    })

    write_table(database, "QuestionAcceptedAnswer", df)
    log(statistics_file, "# question-acceptedAnswer pairs: %d" % len(df))
def posts_processing(site_name, directory, database):
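    """Stream-parse Posts.xml into question and answer records and write the derived question/answer tables."""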

    questions = init_posts(PostTypeId=1)
    answers = init_posts(PostTypeId=2)

    for event, elem in ET.iterparse(os.path.join(directory, "Posts.xml")):
        if event == "end":
            try:
                # print elem.tag,elem.attrib
                process_element(questions, elem, PostTypeId=1)
                process_element(answers, elem, PostTypeId=2)
                elem.clear()
            except Exception as e:
                pass
            # print("Exception: %s" % e)

    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(
        statistics_file, "# posts: " +
        str(len(questions["QuestionId"]) + len(answers["AnswerId"])))
    log(statistics_file, "# questions: " + str(len(questions["QuestionId"])))
    log(statistics_file, "# answers: " + str(len(answers["AnswerId"])))

    process_question_text(site_name, questions, database, directory)
    process_question_tags(site_name, questions, database)
    process_question_acceptedanswer(site_name, questions, database)
    process_question_meta(site_name, questions, database)
    questions.clear()  # clear questions dictionary to free up memory
    process_answer_body(site_name, answers, database, directory)
    process_answer_meta(site_name, answers, database)
# Example #6
def save_to_html(figure_file_p, figure_file_c, df_tokens, df_words, df_stats_p,
                 df_stats_c, df_duplicates, directory, site):
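    """Render the per-site statistics tables and figures into an HTML report (<site>_stats.html)."""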
    df_tokens.index += 1
    df_words.index += 1
    tokens = df_tokens.to_html(classes='table table-striped',
                               bold_rows=False,
                               justify='center',
                               border=2)
    words = df_words.to_html(classes='table table-striped',
                             bold_rows=False,
                             justify='center',
                             border=2)
    df_stats = pd.concat([df_stats_p, df_stats_c])
    stats = df_stats.to_html(classes='table table-striped',
                             index=False,
                             justify='center',
                             border=2)
    duplicates = df_duplicates.to_html(classes='table table-striped',
                                       index=False,
                                       justify='center',
                                       border=2)

    html_file = os.path.join(directory, site + '_stats.html')

    text = """<html><head></head><body>
    <div style="margin-top:50px"><h1 style="text-align: center;">""" + site + """</h1></div>
    <div style="float:left; margin:10px"><h3 style="text-align: center;"><br><br>Common Words</h3>""" + words + """</div>
    <div style="float:left; margin:10px"><h3 style="text-align: center;"><br>Common Formula<br>Tokens</h3>""" + tokens + """</div>
    <div style="float:left; margin:10px"><h3 style="text-align: center;">Common Formula<br>Duplicates<br>(min. 2 Tokens)</h3>""" + duplicates + """</div>
    <div style="float:left; margin:10px">
    <div style="float:top"><h3 style="text-align: left;"><br><br>&emspFormula Statistics</h3>""" + stats + """</div>
    <div style="float:top; margin-top:20px">""" + '<img src="' + figure_file_p + '" alt="statistics figure ' + figure_file_p + '" width="600" style=\'border:2px solid #000000\'>' + """</div>
    <div style="float:top; margin-top:20px">""" + '<img src="' + figure_file_c + '" alt="statistics figure ' + figure_file_c + '" width="600" style=\'border:2px solid #000000\'>' + """</div>
    </div>
    </body></html>"""
    with open(html_file, 'w') as f:
        f.write(text)

    log("../output/statistics.log",
        "Wrote file " + os.path.join(directory, site + '_stats.html'))
# Example #7
def comments_formula_processing(site_name, database, directory, context_length):
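    """Extract LaTeX formulas from the comments of a site and write them to the FormulasComments table."""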
    DB = sqlite3.connect(database)
    comments = pd.read_sql('select CommentId, Text from "Comments" where Site="'+site_name+'"', DB)
    DB.close()

    Formulas = {"FormulaId": [], "Site": [], "CommentId": [], "LaTeXBody":[], "TokenLength": [], "StartingPosition": [], "Inline": []}
    #formula_con = {}

    error_count = 0
    starting_formula_index = current_formula_id(database)
    formula_index = 0
    for comment, body in zip(comments["CommentId"], comments["Text"]):
        if site_name not in DumpDownloader.special_delim:
            formulas, positions, inline, error = formula_extr(body, site_name)
        else:
            formulas, positions, inline, error = formula_extr_special(body, DumpDownloader.special_delim[site_name])
        if not error:
            for formula, position, inl in zip(formulas, positions, inline):
                Formulas["FormulaId"].append(starting_formula_index+formula_index)
                Formulas["Site"].append(site_name)
                Formulas["CommentId"].append(comment)
                Formulas["LaTeXBody"].append(formula)
                #Formulas["SLTBody"].append(get_mathml(formula))
                Formulas["TokenLength"].append(formula_token_length(formula))
                Formulas["StartingPosition"].append(position)
                Formulas["Inline"].append(inl)
                #formula_con[starting_formula_index+formula_index] = [int(comment), formula, position, inl]
                formula_index += 1
        else:
            error_count += 1

        # flush to the database in chunks to limit memory usage, appending to the table
        if len(Formulas["FormulaId"]) > 1000000:
            df = pd.DataFrame(Formulas)
            write_table(database, 'FormulasComments', df, "append")
            Formulas = {"FormulaId": [], "Site": [], "CommentId": [], "LaTeXBody": [], "TokenLength": [], "StartingPosition": [], "Inline": []}
            del df

    df = pd.DataFrame(Formulas)
    write_table(database, 'FormulasComments', df)

    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, str(formula_index) + " formulas parsed from comments")
    log(statistics_file, str(error_count) + " errors in parsing comment formulas")
    log(statistics_file, "error rate parsing formulas from comments: " + format(error_count/(len(comments["CommentId"]))*100, ".4f") + " %")
# Example #8
def questions_formula_processing(site_name, database, directory, context_length):
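    """Extract LaTeX formulas from question titles and bodies of a site and write them to the FormulasPosts table."""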
    DB = sqlite3.connect(database)
    questions = pd.read_sql('select * from "QuestionText" where Site="'+site_name+'"', DB)
    DB.close()

    Formulas = {"FormulaId": [], "Site": [], "PostId": [], "LaTeXBody":[], "TokenLength": [], "StartingPosition": [], "Inline": []}
    #formula_con={}
    error_count = 0
    starting_formula_index = current_formula_id(database)
    formula_index = 0

    # question processing (title and body)
    for question, title, body in zip(questions["QuestionId"], questions["Title"], questions["Body"]):
        if site_name not in DumpDownloader.special_delim:
            formulas_title, positions_title, _, error_title = formula_extr(title, site_name)
            formulas_body, positions_body, inline, error_body = formula_extr(body, site_name)
        else:
            formulas_title, positions_title, _, error_title = formula_extr_special(title, DumpDownloader.special_delim[site_name])
            formulas_body, positions_body, inline, error_body = formula_extr_special(body, DumpDownloader.special_delim[site_name])

        # if parsing errors occurred (about 6500 in total), do not take formulas from the affected texts
        if not error_title and not error_body:
            for formula, position in zip(formulas_title, positions_title):
                Formulas["FormulaId"].append(starting_formula_index+formula_index)
                Formulas["Site"].append(site_name)
                Formulas["PostId"].append(int(question))
                Formulas["LaTeXBody"].append(formula)
                #Formulas["SLTBody"].append(get_mathml(formula))
                Formulas["TokenLength"].append(formula_token_length(formula))
                # position -1 for formulas in title
                Formulas["StartingPosition"].append(-1)
                Formulas["Inline"].append(True)
                #formula_con[starting_formula_index+formula_index] = [int(question), formula, position, inl]
                formula_index += 1
            for formula, position, inl in zip(formulas_body, positions_body, inline):
                Formulas["FormulaId"].append(starting_formula_index+formula_index)
                Formulas["Site"].append(site_name)
                Formulas["PostId"].append(int(question))
                Formulas["LaTeXBody"].append(formula)
                #Formulas["SLTBody"].append(get_mathml(formula))
                Formulas["TokenLength"].append(formula_token_length(formula))
                Formulas["StartingPosition"].append(position)
                Formulas["Inline"].append(inl)
                #formula_con[starting_formula_index+formula_index] = [int(question), formula, position, inl]
                formula_index += 1
        else:
            error_count += 1

        # flush to the database in chunks to limit memory usage, appending to the table
        if len(Formulas["FormulaId"]) > 1000000:
            df = pd.DataFrame(Formulas)
            write_table(database, 'FormulasPosts', df, "append")
            Formulas = {"FormulaId": [], "Site": [], "PostId": [], "LaTeXBody": [], "TokenLength": [], "StartingPosition": [], "Inline": []}
            del df

    df = pd.DataFrame(Formulas)
    write_table(database, 'FormulasPosts', df)

    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, str(formula_index) + " formulas parsed from questions")
    log(statistics_file, str(error_count) + " errors in parsing question formulas")
    log(statistics_file, "error rate parsing formulas from questions: " + format(error_count/(len(questions["QuestionId"]))*100, ".4f") + " %")
# Example #9
def context_main(filename_dumps, dump_directory, database, x, n, corpus, tablename, tfidf, all, stopwords):
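    """Extract the textual context of every formula in posts and comments of the given dump sites
    and write the (optionally tf-idf-ranked) top-n context terms to a database table."""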
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    start = time.time()
    with open(filename_dumps) as f:
        sites = [line.rstrip() for line in f if line.strip() != ""]

    log(statistics_file, "#################################################")
    log(statistics_file, "context.py")
    log(statistics_file, "input: " + dump_directory + ", x - " + str(x) + ", n - "+ str(n) + ", corpus - " + corpus + ", tfidf - " + tfidf + ", all - "+ all)
    log(statistics_file, "output: "+ database + " - Table " + tablename + ", " + statistics_file)
    log(statistics_file, "dumps: " + str(sites))
    log(statistics_file, "-------------------------")

    downloader = DumpDownloader()
    directories = [os.path.join(dump_directory, downloader.get_file_name(site)).replace(".7z", "/") for site in sites]

    all = (all == 'yes')
    tfidf = (tfidf == 'yes')
    if stopwords == 'none':
        stopwords = None
    elif stopwords == 'english':
        stopwords = 'english'
    else:
        # otherwise treat the argument as the path to a custom stopword file
        try:
            with open(stopwords) as f:
                stopwords = [line.rstrip() for line in f if line.strip() != ""]
            print("Stopwords: " + str(stopwords))
        except Exception:
            print("File " + stopwords + " not found or error while parsing. Using no stopwords")
            stopwords = None


    if not all:
        if corpus not in ("all", "individual"):
            sys.exit("option for --corpus must be 'all' or 'individual'")
        if corpus == "all":
            print("Calculating idf values of all sites texts")
            t1 = time.time()
            bow = calculate_idf(sites, directories, database, stopwords)
            log(statistics_file, "time calculating idf scores: " + str(int((time.time()-t1)/60)) + "min " + str(int((time.time()-t1)%60)) + "sec")


    if_exists = "replace"
    for site, directory in zip(sites, directories):
        if not os.path.exists(directory):
            print(directory + " not found")

        if (corpus == "individual") and not all:
            print("Calculating idf values of texts of site " + site)
            t1 = time.time()
            bow = calculate_idf([site], directories, database, stopwords)
            log(statistics_file, "time calculating idf scores: " + str(int((time.time()-t1)/60)) + "min " + str(int((time.time()-t1)%60)) + "sec")


        # for each formula: get its context, compute the top-n context terms,
        # and save them in a database table (the table name is configurable)
        contexts, docs = posts_context(directory, database, site, x, all)
        t1 = time.time()
        if all:
            top_n_contexts = {}
            for postid in contexts:
                for id, context in contexts[postid].items():
                    top_n_contexts[id] = " ".join(context)
        else:
            top_n_contexts = bow.get_top_n_tfidf2(contexts, docs, n, tfidf, all)
        log(statistics_file, "time for contexts posts: "+ str(int((time.time()-t1)/60)) +"min " + str(int((time.time()-t1)%60)) + "sec")

        write_context_table(site, top_n_contexts, database, tablename, if_exists)

        if_exists = "append"

        contexts, docs = comments_context(directory, database, site, x, all)
        t1 = time.time()
        if all:
            top_n_contexts = {}
            for commentid in contexts:
                for id, context in contexts[commentid].items():
                    top_n_contexts[id] = " ".join(context)
        else:
            top_n_contexts = bow.get_top_n_tfidf2(contexts, docs, n, tfidf, all)
        log(statistics_file, "time for contexts comments: "+ str(int((time.time()-t1)/60)) +"min " + str(int((time.time()-t1)%60)) + "sec")
        write_context_table(site, top_n_contexts, database, tablename, if_exists)

    log(statistics_file, "-------------------------")
    log(statistics_file, "total execution time: "+ str(int((time.time()-start)/60)) +"min " + str(int((time.time()-start)%60)) + "sec")
    log(statistics_file, "max memory usage: " + format((resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)/pow(2,30), ".3f")+ " GigaByte")
    log(statistics_file, "#################################################")
# Example #10
def main(filename_dumps, database, directory):
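    """Compute per-site formula and text statistics from the database, plot tag and formula
    distributions, and write an HTML report for each site."""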
    statistics_file = os.path.join(directory, "statistics.log")

    with open(filename_dumps) as f:
        sites = [line.rstrip() for line in f if line.strip() != ""]

    start = time.time()
    log(statistics_file, "#################################################")
    log(statistics_file, "statistics.py")
    log(statistics_file, "input: " + database)
    log(statistics_file, "output: " + directory + ", " + statistics_file)
    log(statistics_file, "dumps: " + str(sites))
    log(statistics_file, "-------------------------")

    # set plot style (seaborn)
    sns.set_theme(context="paper")

    for site in sites:
        DB = sqlite3.connect(database)
        formulas_posts = pd.read_sql(
            'select FormulaId, PostId, TokenLength from "FormulasPosts" where Site="'
            + site + '"', DB)
        question_ids = pd.read_sql(
            'select QuestionId from "QuestionTags" where Site="' + site + '"',
            DB)
        answer_ids = pd.read_sql(
            'select AnswerId from "AnswerMeta" where Site="' + site + '"', DB)
        DB.close()
        post_ids = list(question_ids["QuestionId"]) + list(
            answer_ids["AnswerId"])
        question_ids.pop("QuestionId")
        answer_ids.pop("AnswerId")

        figure_file_p, df_stats_p = formulas_per_post(
            dict(zip(formulas_posts["FormulaId"],
                     formulas_posts["PostId"])), post_ids,
            list(formulas_posts["TokenLength"]), site, directory, "post")
        formulas_posts.pop("FormulaId")
        formulas_posts.pop("PostId")
        #print("max memory usage: " + format((resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)/pow(2,30), ".3f")+ " GigaByte")

        DB = sqlite3.connect(database)
        question_texts = pd.read_sql(
            'select Title, Body from "QuestionText" where Site="' + site + '"',
            DB)
        answer_texts = pd.read_sql(
            'select Body from "AnswerText" where Site="' + site + '"', DB)
        DB.close()

        df_words = common_words(
            list(question_texts["Title"]) + list(question_texts["Body"]) +
            list(answer_texts["Body"]), 100)
        question_texts.pop("Title")
        question_texts.pop("Body")
        answer_texts.pop("Body")
        #print("max memory usage: " + format((resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)/pow(2,30), ".3f")+ " GigaByte")

        DB = sqlite3.connect(database)
        formulas_comments = pd.read_sql(
            'select FormulaId, CommentId, TokenLength from "FormulasComments" where Site="'
            + site + '"', DB)
        comment_ids = pd.read_sql(
            'select CommentId from "Comments" where Site="' + site + '"', DB)
        DB.close()

        figure_file_c, df_stats_c = formulas_per_post(
            dict(
                zip(formulas_comments["FormulaId"],
                    formulas_comments["CommentId"])),
            list(comment_ids["CommentId"]),
            list(formulas_comments["TokenLength"]), site, directory, "comment")
        comment_ids.pop("CommentId")
        formulas_comments.pop("FormulaId")
        formulas_comments.pop("CommentId")

        DB = sqlite3.connect(database)
        formulas_posts = pd.read_sql(
            'select LaTeXBody from "FormulasPosts" where Site="' + site + '"',
            DB)
        formulas_comments = pd.read_sql(
            'select LaTeXBody from "FormulasComments" where Site="' + site +
            '"', DB)
        DB.close()

        all_formulas = list(formulas_posts["LaTeXBody"]) + list(
            formulas_comments["LaTeXBody"])
        formulas_posts.pop("LaTeXBody")
        formulas_comments.pop("LaTeXBody")
        df_tokens = common_tokens(all_tokens(all_formulas), 100)
        all_formulas = []

        DB = sqlite3.connect(database)
        formulas_posts = pd.read_sql(
            'select LaTeXBody from "FormulasPosts" where Site="' + site +
            '" and TokenLength>"1"', DB)
        formulas_comments = pd.read_sql(
            'select LaTeXBody from "FormulasComments" where Site="' + site +
            '" and TokenLength>"1"', DB)
        DB.close()

        all_formulas = list(formulas_posts["LaTeXBody"]) + list(
            formulas_comments["LaTeXBody"])
        formulas_posts.pop("LaTeXBody")
        formulas_comments.pop("LaTeXBody")

        df_duplicates = duplicate_formulas(all_formulas, 100)
        print("max memory usage: " +
              format((resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) /
                     pow(2, 30), ".3f") + " GigaByte")

        DB = sqlite3.connect(database)
        tags = pd.read_sql(
            'select Tag, Count from "Tags" where Site="' + site +
            '" ORDER BY Count DESC limit 100', DB)
        DB.close()
        tags_histo_all(
            tags, "100 Most Frequent Tags Distribution in '" + site + "'",
            os.path.join(directory, "diagrams", site + "_tags_desc.png"))

        DB = sqlite3.connect(database)
        top_tags = pd.read_sql(
            'select Tag, Count from "Tags" where Site="' + site +
            '" ORDER BY Count DESC limit 20', DB)
        DB.close()
        tags_histo(top_tags, "20 Most Frequent Tags in '" + site + "'",
                   os.path.join(directory, "diagrams", site + "_top_tags.png"))

        DB = sqlite3.connect(database)
        bottom_tags = pd.read_sql(
            'select Tag, Count from "Tags" where Site="' + site +
            '" AND Count > 3 ORDER BY Count ASC limit 20', DB)
        DB.close()
        tags_histo(
            bottom_tags, "20 Least Frequent Tags in '" + site + "'",
            os.path.join(directory, "diagrams", site + "_bottom_tags.png"))

        save_to_html(figure_file_p, figure_file_c, df_tokens, df_words,
                     df_stats_p, df_stats_c, df_duplicates, directory, site)

    log(statistics_file, "-------------------------")
    log(
        statistics_file,
        "total execution time: " + str(int((time.time() - start) / 60)) +
        "min " + str(int((time.time() - start) % 60)) + "sec")
    log(
        statistics_file, "max memory usage: " + format(
            (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) / pow(2, 30),
            ".3f") + " GigaByte")
    log(statistics_file, "#################################################")
# Example #11
def formulas_per_post(formulaid_postid, all_postids, token_lengths, site,
                      directory, text_type):
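    """Plot the distribution of formulas per post/comment and of formula token lengths for a site;
    return the relative figure path and a DataFrame of summary statistics."""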
    posts = len(all_postids)
    formulas = len(formulaid_postid)
    stats_titles = []
    stats_values = []

    stats_titles.append("total " + text_type + "s: ")
    stats_values.append(format(posts, ',d'))

    stats_titles.append("total formulas in " + text_type + "s: ")
    stats_values.append(format(formulas, ',d'))

    stats_titles.append("average number of formulas per " + text_type + ": ")
    stats_values.append(format(formulas / posts, ".2f"))

    # percentage of posts with at least 1 formula
    unique_postids = len(set(formulaid_postid.values()))
    stats_titles.append("total number of " + text_type +
                        "s containing formulas: ")
    stats_values.append(format(unique_postids, ',d'))
    stats_titles.append("percentage of " + text_type +
                        "s containing formulas: ")
    stats_values.append(format(100 * unique_postids / posts, ".2f"))

    # number of formulas in post {postid: #formulas}
    counter = Counter(formulaid_postid.values())

    # number of posts with x formulas {x: #posts}
    counts_counter = Counter(counter.values())
    # manually add number of posts with 0 formulas
    counts_counter[0] = posts - unique_postids

    rev_ordered_counts_counter = collections.OrderedDict(
        sorted(counts_counter.items(), reverse=True))

    to_remove = 0.02 * posts
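    # collapse the long tail (posts with >= 20 formulas, at most ~2% of all posts) into a single ">= x" bar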
    removed = 0
    top = []
    for k, v in rev_ordered_counts_counter.items():
        if k < 20:
            break
        if removed <= to_remove:
            top.append(counts_counter.pop(k))
            removed += v
        else:
            break

    prev = 0
    for k in sorted(counts_counter.keys()):
        while k > prev + 1:
            prev += 1
            counts_counter[prev] = 0
        prev = k

    fig, (ax1, ax2) = plt.subplots(2, 1)
    # make this into a histogram of number of formula distribution in questions, answers, posts and comments
    #plt.subplot(2, 1, 1)
    #fig.subplots_adjust(left=0.2, hspace=0.55, wspace=0.3)
    o_counts_counter = OrderedDict(sorted(counts_counter.items()))
    labels = [str(k) for k in o_counts_counter.keys()] + ["x"]
    o_counts_counter[len(o_counts_counter)] = removed

    ax1.bar(labels,
            o_counts_counter.values(),
            color='darkgreen',
            edgecolor='black',
            linewidth=1)
    ax1.set_title("Formula Distribution of '" + site + "' in " +
                  text_type.title() + "s")
    ax1.set_xlabel("Number of Formulas per " + text_type.title())
    ax1.set_ylabel("Number of " + text_type.title() + "s")
    plt.sca(ax1)
    labels.remove("x")
    maximum = labels[len(labels) - 1]
    if int(maximum) < 20:
        locations = reduce_labels(labels)
        labels = locations
    else:
        locations = reduce_labels(labels)
        if (locations[-1] == labels[-1]) and (len(locations) > 5):
            locations.remove(labels[-1])
            labels.remove(labels[-1])
        locations.append("x")
        labels = reduce_labels(labels)
        labels.append(r'$\geq$' + str(int(maximum) + 1))
    plt.xticks(locations, labels)
    plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))

    counter = Counter(sorted(token_lengths))
    counter[0] = 0
    removed = 0

    rev_ordered_counter = collections.OrderedDict(
        sorted(counter.items(), reverse=True))

    to_remove = 0.05 * len(token_lengths)
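    # collapse very long formulas (>= 20 tokens, at most ~5% of all formulas) into a single ">= x" bar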
    top = []
    for k, v in rev_ordered_counter.items():
        if k < 20:
            break
        if removed <= to_remove:
            top.append(counter.pop(k))
            removed += v
        else:
            break

    prev = 0
    for k in sorted(counter.keys()):
        while k > prev + 1:
            prev += 1
            counter[prev] = 0
        prev = k

    ordered_counter = collections.OrderedDict(sorted(counter.items()))
    labels = [str(k) for k in ordered_counter.keys()] + ["x"]

    ordered_counter[max(ordered_counter.keys()) + 1] = removed
    ax2.bar(labels,
            ordered_counter.values(),
            color='darkgreen',
            edgecolor='black',
            linewidth=1)
    ax2.set_title("Formula Length Distribution of " + text_type.title() +
                  "s in '" + site + "'")
    ax2.set_xlabel("Number of Tokens per Formula")
    ax2.set_ylabel("Number of Formulas")
    plt.sca(ax2)
    labels.remove("x")
    maximum = labels[len(labels) - 1]
    locations = reduce_labels(labels)
    if (locations[-1] == labels[-1]) and (len(locations) > 5):
        locations.remove(labels[-1])
        labels.remove(labels[-1])
    locations.append("x")
    labels = reduce_labels(labels)
    labels.append(r'$\geq$' + str(int(maximum) + 1))
    plt.xticks(locations, labels)
    plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))

    fig.tight_layout()
    file = os.path.join(directory, "diagrams",
                        site + "_" + text_type + "_stats.png")
    fig.savefig(file, dpi=400)

    log("../output/statistics.log", "Figure saved to " + file)
    return os.path.join("diagrams",
                        site + "_" + text_type + "_stats.png"), pd.DataFrame({
                            "Title":
                            stats_titles,
                            "Value":
                            stats_values
                        })
# Example #12
def main(dump_directory, filename_dumps, download, extract, database,
         force_process):
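    """Create the database tables and process every dump site; when extracting, a site is only
    reprocessed if its archive's MD5 hash changed or force_process is set."""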
    if not os.path.isdir(os.path.dirname(database)):
        os.mkdir(os.path.dirname(database))
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    start = time.time()
    log(statistics_file, "#################################################")
    log(statistics_file, "main.py")
    log(statistics_file, "input: " + dump_directory)
    log(statistics_file, "output: " + database + statistics_file)
    log(statistics_file, "dumps: " + filename_dumps)
    log(statistics_file, "-------------------------")

    sites, directories, files = dumps(dump_directory, filename_dumps, download,
                                      extract)

    dump_processing.database.create_tables(database)

    DB = sqlite3.connect(database)
    sites_hashs = pd.read_sql('select * from "SiteFileHash"', DB)
    DB.close()

    bag_of_words = BOW()
    first = True

    for site, dir, file in zip(sites, directories, files):
        log(statistics_file, "Processing site " + site)
        if extract == "yes":
            with open(file, 'rb') as f:
                hasher = hashlib.md5()
                for chunk in iter(lambda: f.read(128 * hasher.block_size),
                                  b''):
                    hasher.update(chunk)
                hash = hasher.hexdigest()
            exists = (sites_hashs["Site"] == site).any()
            if exists:
                old_hash = sites_hashs["MD5Hash"][sites_hashs[
                    sites_hashs["Site"] == site].index.values[0]]
            else:
                old_hash = ""
            if (hash != old_hash) or (force_process == "yes"):
                dump_processing.database.remove_site(site, database)
                dump_processing.process_dump.processing_main(
                    site, dir, database, 7)
                save_hash(database, site, hash, exists)
        else:
            dump_processing.database.remove_site(site, database)
            dump_processing.process_dump.processing_main(
                site, dir, database, 7)

    # TODO: highlighted, bold etc words

    log(statistics_file, "-------------------------")
    log(
        statistics_file,
        "total execution time: " + str(int((time.time() - start) / 60)) +
        "min " + str(int((time.time() - start) % 60)) + "sec")
    log(
        statistics_file, "max memory usage: " + format(
            (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) / pow(2, 30),
            ".3f") + " GigaByte")
    log(statistics_file, "#################################################")
# Example #13
def main(filename_dumps, database, mode, threads, tree, comments, offset,
         total_formulas, output_database):
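    """Convert the stored LaTeX formulas of each site to Content and/or Presentation MathML and log the progress."""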
    statistics_file = os.path.join(
        Path(output_database).parent, "parse_formulas.log")
    if not os.path.exists(statistics_file):
        with open(statistics_file, 'w'):
            pass
    start = time.time()
    log(statistics_file, "#################################################")
    log(statistics_file, "parse_formulas.py")
    log(statistics_file,
        "input: " + database + ", mode: " + mode + ", " + threads + " threads")
    log(statistics_file, "output: " + output_database + ", " + statistics_file)

    with open(filename_dumps) as f:
        sites = [line.rstrip() for line in f if line != ""]

    log(statistics_file, "dumps: " + str(sites))
    log(statistics_file, "-------------------------")

    try:
        threads = int(threads)
    except ValueError:
        print("An error occurred parsing --threads argument " + threads)
    create_mathml_tables(output_database)

    if tree == "yes":
        tree = True
    else:
        tree = False

    if comments == "yes":
        comments = True
    else:
        comments = False

    for site in sites:
        # use a per-site timer so the total execution time logged below stays correct
        site_start = time.time()
        if mode == "cmml":
            formulas_to_cmml(database, "FormulasPosts", site, threads, tree,
                             offset, total_formulas, output_database)
            sys.stdout.write('\n')
            if comments:
                formulas_to_cmml(database, "FormulasComments", site, threads,
                                 tree, offset, total_formulas, output_database)
        if mode == "pmml":
            formulas_to_pmml(database, "FormulasPosts", site, threads, tree,
                             offset, total_formulas, output_database)
            sys.stdout.write('\n')
            if comments:
                formulas_to_pmml(database, "FormulasComments", site, threads,
                                 tree, offset, total_formulas, output_database)
        if mode == "both":
            formulas_to_both_ml(database, "FormulasPosts", site, threads, tree,
                                offset, total_formulas, output_database)
            sys.stdout.write('\n')
            if comments:
                formulas_to_both_ml(database, "FormulasComments", site,
                                    threads, tree, offset, total_formulas,
                                    output_database)
        sys.stdout.write('\n' + site + ' finished. Time: ' +
                         str(int((time.time() - site_start) / 60)) + "min " +
                         str(int((time.time() - site_start) % 60)) + "sec")

    log(statistics_file, "\n-------------------------")
    log(
        statistics_file,
        "total execution time: " + str(int((time.time() - start) / 60)) +
        "min " + str(int((time.time() - start) % 60)) + "sec")
    log(
        statistics_file, "max memory usage: " + format(
            (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) / pow(2, 30),
            ".3f") + " GigaByte")
    log(statistics_file, "#################################################")
def comments_processing(site_name, directory, database):
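    """Stream-parse Comments.xml of a site and write the comments to the Comments table in chunks."""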
    comments = {
        "Site": [],
        "CommentId": [],
        "PostId": [],
        "UserId": [],
        "Score": [],
        "Text": [],
        "CreationDate": []
    }
    comment_index = 0

    comments_dict = {}

    for event, elem in ET.iterparse(os.path.join(directory, "Comments.xml")):
        if event == "end":
            try:
                postid = int(elem.attrib["PostId"])
                userid = int(elem.attrib["UserId"])
                score = int(elem.attrib["Score"])
                creationdate = elem.attrib["CreationDate"]
                text = elem.attrib["Text"]

                comments["Site"].append(site_name)
                comments["CommentId"].append(comment_index)
                comments["PostId"].append(postid)
                comments["UserId"].append(userid)
                comments["Score"].append(score)
                comments["CreationDate"].append(creationdate)
                comments["Text"].append(text)
                elem.clear()

                #comments_dict[comment_index] = text

                comment_index += 1
            except Exception:
                # skip rows with missing or malformed attributes
                pass
        # flush to the database in chunks to limit memory usage, appending to the table
        if len(comments["CommentId"]) > 1000000:
            df = pd.DataFrame({
                "Site": comments["Site"],
                "CommentId": comments["CommentId"],
                "PostId": comments["PostId"],
                "UserId": comments["UserId"],
                "Score": comments["Score"],
                "Text": comments["Text"],
                "CreationDate": comments["CreationDate"]
            })
            write_table(database, 'Comments', df, "append")
            comments = {
                "Site": [],
                "CommentId": [],
                "PostId": [],
                "UserId": [],
                "Score": [],
                "Text": [],
                "CreationDate": []
            }

    df = pd.DataFrame({
        "Site": comments["Site"],
        "CommentId": comments["CommentId"],
        "PostId": comments["PostId"],
        "UserId": comments["UserId"],
        "Score": comments["Score"],
        "Text": comments["Text"],
        "CreationDate": comments["CreationDate"]
    })
    write_table(database, 'Comments', df)
    statistics_file = os.path.join(Path(database).parent, "statistics.log")
    log(statistics_file, "# comments: " + str(len(df)))