Example #1
def check_output(name):
    # Sanity-check a scraped CSV: dump the first row, then print the first
    # repository whose "yaml" column is the string "False".
    data = csvReader.readfile(name)
    print(data[0])
    print(data[0]["yaml"])
    print(type(data[0]["yaml"]))
    for line in data:
        if line.get("yaml") == "False":
            print(line)
            break
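Every example here calls into a project-local csvReader module that is not included in these snippets. A minimal sketch of the readfile contract they assume, using only the standard library (the real module may do more):

import csv

def readfile(name):
    # Assumed behaviour: return each CSV row as a dict keyed by the header
    # row, matching how the examples index rows (e.g. line.get("yaml")).
    # Note that every value comes back as a string, which is why
    # check_output above compares against the string "False".
    with open(name, newline="", encoding="utf-8") as csvfile:
        return list(csv.DictReader(csvfile))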
Example #2
def spread_of_data_line_star_other_paper():
    # Record each repository's star count and whether it uses CI, then plot
    # the percentage of repositories that use CI per star bucket.
    data = csvReader.readfile("breadth_corpus.csv")
    results = []
    for line in data:
        if line.get("CI") and int(line.get("CI")) > 0:
            results.append((int(line.get("stars")), 1))
        else:
            results.append((int(line.get("stars")), 0))

    return create_percentage_bar_graphs(results, "percentage of stars that use CI", "stars")
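create_percentage_bar_graphs is another helper that is not shown. Judging by the call above, it takes (stars, 0-or-1) pairs and plots the percentage of flagged repositories per bucket; a hedged sketch of that idea with pandas and matplotlib (the bucket edges are invented for illustration):

import matplotlib.pyplot as plt
import pandas as pd

def create_percentage_bar_graphs(results, title, xlabel):
    # Assumed behaviour: bucket the first element of each pair and show, per
    # bucket, the percentage of pairs whose second element is 1.
    df = pd.DataFrame(results, columns=[xlabel, "flag"])
    bins = [-1, 10, 100, 1000, 10000, float("inf")]  # illustrative buckets
    percentages = df.groupby(pd.cut(df[xlabel], bins))["flag"].mean() * 100
    ax = percentages.plot.bar()
    ax.set_title(title)
    ax.set_ylabel("percentage")
    plt.tight_layout()
    return ax.get_figure()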
Example #3
def check(filename):
    # Summarise a scraped CSV: row count, column count, and how many rows
    # have a readme or a Jenkins pipeline, printed as aligned columns.
    if filename is None:
        return 0
    lines = csvReader.readfile(filename)
    if len(lines) == 0:
        keys_length = 0
    else:
        keys_length = len(lines[0].keys())
    readme = 0
    jenkins = 0
    for line in lines:
        if line.get("readme") is not None:
            readme += 1

        if line.get("jenkinsPipeline0"):
            jenkins += 1

    print("filename: {}{}{}{}{}".format(filename, pad(keys_length, 20),
                                        pad(len(lines), 20), pad(readme),
                                        pad(jenkins)))
    return len(lines)
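check leans on a pad helper that is called both with and without a width argument. A plausible sketch, assuming it left-justifies a value into a fixed-width column for the aligned summary line:

def pad(value, width=10):
    # Assumed behaviour: render the value into a fixed-width column so the
    # counts printed by check() line up across files.
    return str(value).ljust(width)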
Example #4
def merge(mypath, save=True, query="raptor"):
    # Merge every CSV under mypath whose name contains `query` into a single
    # file, deduplicating rows by repository id and counting forks.
    onlyfiles = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and f.endswith(".csv") and query in f
    ]
    combined = {}
    duplicates = 0
    forks = 0
    for filename in onlyfiles:
        tempfiles = csvReader.readfile(join(mypath, filename))
        for line in tempfiles:
            for k in csvReader.KEYS_TO_TRY:
                # fill in an empty string when a repo has no config of this kind
                if line.get(k) is None:
                    line[k] = ""

            # the watchers field is not always scraped; default missing values to 0
            if line.get("watchers") is None:
                line["watchers"] = 0

            # count rows whose id already appeared in an earlier file
            if combined.get(line.get("id")) is not None:
                duplicates += 1
            if line.get("fork") is not None and isinstance(
                    line.get("fork"), bool) and line.get("fork"):
                forks += 1
            combined[line.get("id")] = line

    print("duplicates: ", duplicates)
    print("forks: ", forks)
    print("results: ", len(combined.values()))
    if save:
        name = csvReader.check_name("combined")
        if name:
            csvReader.writeToCsv(list(combined.values()), name, fields=FIELDS)
            return f"{name}.csv"
        print("too many combined copies already found")
    return None
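csvReader.check_name is used here and again in Example #6, and both call sites treat an empty string as "all numbered copies are taken". A sketch consistent with that behaviour (the numbered naming scheme is an assumption):

import os

def check_name(base, limit=10):
    # Assumed behaviour: return the first "<base><i>" for which no CSV file
    # exists yet, or "" once all `limit` numbered copies are taken.
    for i in range(limit):
        candidate = f"{base}{i}"
        if not os.path.exists(f"{candidate}.csv"):
            return candidate
    return ""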
Example #5
def main(experimenting, name1, name2, image_encoding, output="."):
    # Generate the figures and LaTeX tables for the report; `experimenting`
    # switches to the scratch branch used while trying out new plots.
    if experimenting:
        # data = csvReader.readfile("combined1.csv")
        # sorted_data = csvReader.readfile("yaml threaded6.csv")
        # spread_of_data_v2(data, sorted_data).show()
        # plt.clf()
        # spread_of_data_line_sub(data, sorted_data).show()
        # save_as_pdf(language_type(load_dataframe(name1), sorted_data), f"{output}/languages", image_encoding)
        # languages_table_topn(f"{output}/languages table.tex", 20, load_dataframe(name1), sorted_data)
        # langs = languages_table_topn(f"{output}/languages table.tex", 30, load_dataframe(name1), sorted_data)
        # save_as_pdf(popularity_vs_percentage_CI_scatter(langs, sorted_data), f"{output}/languages-scatter-CI",
        #             image_encoding)
        # save_as_pdf(config_topn(sorted_data, 20), f"{output}/config-topn", image_encoding)
        # code_with_comments(sorted_data, "code", "comments")
        # code_with_comments(sorted_data, "code", "file_lines")
        # code_with_comments(sorted_data, "code", "blank_lines")
        # code_with_comments(sorted_data, "single_line_comment", "multi_line_comment")
        # code_with_comments(sorted_data, "multi_line_comment", "multi_line_comment_unique")
        #
        # code_with_comments(sorted_data, "code", "comments", "lang")
        # code_with_comments(sorted_data, "code", "file_lines", "lang")

        # line_usage_configuration(sorted_data[sorted_data["yaml"]]).show()
        # line_usage_configuration2(sorted_data[sorted_data["yaml"]]).show()
        # line_usage_configuration2(sorted_data[sorted_data["yaml"] == False]).show()
        # comment_usage_table(sorted_data)

        # save_as_pdf(line_usage_configuration(sorted_data), f"{output}/line structure all", image_encoding)
        # save_as_pdf(line_usage_configuration(sorted_data[sorted_data["yaml"]]), f"{output}/line structure yaml",
        #             image_encoding)
        # save_as_pdf(line_usage_configuration2(sorted_data[sorted_data["yaml"]]),
        #             f"{output}/line structure yaml comments", image_encoding)
        # save_as_pdf(line_usage_configuration2(sorted_data[sorted_data["yaml"] == False]),
        #             f"{output}/line structure none yaml comments", image_encoding)
        #
        # # data = csvReader.readfile(name1)
        # # #
        # # save_as_pdf(spread_of_data_sub_to_stars(data), f"{output}/sub vs stars", image_encoding)
        # scripts_latex(f"{output}/scripts table new.tex", sorted_data)
        # save_as_pdf(script_usage(sorted_data), f"{output}/scripts usage bars", image_encoding)

        # save_as_pdf(lines_against_scripts(sorted_data[sorted_data["yaml"]]), f"{output}/scripts vs lines", image_encoding)
        # save_as_pdf(stars_against_lines(sorted_data), f"{output}/scripts vs stars", image_encoding)
        sorted_data = load_dataframe(name2)
        data = pd.read_csv("2020 combined.csv", dtype={"id": int, "language": str}, parse_dates=[
                           "recent_commit1", "recent_commit2", "recent_commit3"])
        # get_last_years_ci_usage(sorted_data, data, None, None)
        # get_last_years_ci_usage(sorted_data, data, "lang","language", "Rust")
        # get_last_years_ci_usage(sorted_data, data, "lang","language", "JavaScript")
        # get_last_years_ci_usage(sorted_data, data, "lang","language", "Go")
        # get_last_years_ci_usage(sorted_data, data, "lang","language", "Python")
        # get_last_years_ci_usage(sorted_data, data, "lang","language", "C++")
        # get_last_years_ci_usage(sorted_data, data, "lang","language", "Java")

        get_last_years_ci_usage(sorted_data, data, "lang", "language")

        # recent_commit1 = get_last_timeframe(data, "recent_commit1", "12M")

        # print("There are {} repositories that had their last commit in the last year. This is {} percentage of the total sample size.".format(len(recent_commit1), len(recent_commit1)/len(data)*100))
        # print("Out of all the last commits {} had CI".format(len(commit1)/len(recent_commit1)*100))

        # ci_two_years = len(get_last_timeframe(sorted_data, "commit_1", "2Y")) - len(commit1)
        # total_two_years = len(get_last_timeframe(data, "recent_commit1", "2Y")) - len(recent_commit1)

        # print("Out of all the last commits {} had CI".format(ci_two_years/total_two_years*100))

        # # # save_as_pdf(ci_usage_against_commits(data, sorted_data), f"{output}/cats", image_encoding)

        # # RQ3
        # langs = languages_table_topn(f"{output}/2019 languages table.tex", 30, load_dataframe(name1), sorted_data)
        # save_as_pdf(popularity_vs_percentage_CI_scatter(langs, sorted_data), f"{output}/2019-languages-scatter-CI",
        #             image_encoding)

    else:
        data = csvReader.readfile(name1)
        # #
        save_as_pdf(spread_of_data_sub_to_stars(data),
                    f"{output}/sub vs stars", image_encoding)
        save_as_pdf(spread_over_time_stars(data),
                    f"{output}/spread over time", image_encoding)
        save_as_pdf(spread_data_issues_vs_stars(data),
                    f"{output}/issues vs stars", image_encoding)

        sorted_data = load_dataframe(name2)
        yaml_config_errors_to_latex(
            f"{output}/yaml config errors.tex", sorted_data)
        # should not need to rerun this unless more scraping is done!!!
        # commented out as manual edits to the formatting are easier than code ones atm
        # config_type_split(f"{output}/configuration type count.tex", sorted_data)

        save_as_pdf(config_topn(sorted_data, 20),
                    f"{output}/config-topn", image_encoding)

        # RQ3
        langs = languages_table_topn(
            f"{output}/languages table.tex",
            30,
            pd.read_csv(name1,
                        dtype=dtypes,
                        parse_dates=["recent_commit1", "recent_commit2", "recent_commit3"]),
            sorted_data)
        save_as_pdf(popularity_vs_percentage_CI_scatter(langs, sorted_data), f"{output}/languages-scatter-CI",
                    image_encoding)

        save_as_pdf(langues_topn(sorted_data, 30),
                    f"{output}/languages-topn", image_encoding)

        save_as_pdf(language_type(pd.read_csv(name1,
                                              dtype=dtypes,
                                              parse_dates=["recent_commit1", "recent_commit2", "recent_commit3"]),
                                  sorted_data),
                    f"{output}/languages", image_encoding)

        # --------------
        sorted_data_csv = csvReader.readfile(name2)
        save_as_pdf(spread_of_data_line_star(data, sorted_data_csv), f"{output}/percentage stars with CI",
                    image_encoding)
        save_as_pdf(spread_of_data_line_sub(data, sorted_data_csv),
                    f"{output}/percentage sub with CI", image_encoding)
        save_as_pdf(spread_of_data_line_star_other_paper(), f"{output}/percentage sub with CI other paper source",
                    image_encoding)

        # render_sankey_diagram.save_sanky_daigram_for_errors_and_comments(f"./{output}/sankey",
        #                                                                  pd.read_csv(name2, dtype=dtypes), False, False,
        #                                                                  image_encoding)
        # render_sankey_diagram.save_sanky_daigram_for_errors_and_comments(f"./{output}/sankey2",
        #                                                                  pd.read_csv(name2, dtype=dtypes), False, True,
        #                                                                  image_encoding)
        # render_sankey_diagram.save_sanky_daigram_for_errors_and_comments(f"./{output}/sankey3",
        #                                                                  pd.read_csv(name2, dtype=dtypes), True, False,
        #                                                                  image_encoding)

        save_as_pdf(line_usage_configuration(sorted_data),
                    f"{output}/line structure all", image_encoding)
        save_as_pdf(line_usage_configuration(sorted_data[sorted_data["yaml"]]), f"{output}/line structure yaml",
                    image_encoding)
        save_as_pdf(line_usage_configuration2(sorted_data[sorted_data["yaml"]]),
                    f"{output}/line structure yaml comments", image_encoding)
        save_as_pdf(line_usage_configuration2(sorted_data[sorted_data["yaml"] == False]),
                    f"{output}/line structure none yaml comments", image_encoding)

        save_as_pdf(comment_usage(sorted_data),
                    f"{output}/comments usage bars", image_encoding)

        save_as_pdf(script_usage(sorted_data),
                    f"{output}/scripts usage bars", image_encoding)
        scripts_latex(f"{output}/scripts table new.tex", sorted_data)
        save_as_pdf(lines_against_scripts(sorted_data),
                    f"{output}/scripts vs lines", image_encoding)
        save_as_pdf(stars_against_lines(sorted_data),
                    f"{output}/scripts vs stars", image_encoding)

    return sorted_data
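For reference, a hedged example of how this entry point might be driven; the file names and encoding below are assumptions pieced together from names that appear elsewhere in these snippets:

# Illustrative call only: the real file names depend on earlier pipeline runs.
sorted_data = main(
    experimenting=False,
    name1="2020 combined.csv",        # merged corpus produced by merge() (assumed)
    name2="2020 yaml threaded0.csv",  # parsed output from Example #6 (assumed)
    image_encoding="pgf",             # whatever save_as_pdf accepts (assumed)
    output="./results",
)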
Example #6
def main(name, data, output_for_latex):
    # Signature assumed from the call under __main__ below; the original
    # snippet starts mid-function. num_worker_threads is a module-level
    # setting that is not shown here.
    name = csvReader.check_name(name, limit=20)

    if name == "":
        print(
            "all candidate file names for the main output are already taken, so nothing can be written to disk"
        )
        return

    print(f"writing to {name}.csv")

    # write the header once; run_main appends the data rows afterwards
    with open(f"{name}.csv", "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile,
                                fieldnames=FIELDS,
                                quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()

    run_main(num_worker_threads, data, name)
    write_to_latex(f"{output_for_latex}/generated_table2.tex", len(data), name)
    return f"{name}.csv"


if __name__ == '__main__':
    main("2020 yaml threaded", csvReader.readfile("2020 combined.csv"),
         "./results")
    # results, results_ci and no_readme are module-level counters,
    # presumably filled in by run_main
    print(len(results))
    print(len(results_ci))
    print(no_readme)

    # check_output("yaml threaded5.csv")
Example #7
name1 = ""
name2 = ""
user_input = ""

print("config check:{} merge:{} parse:{} render:{}".format(
    CHECK, MERGE, PARSE, RENDER))
s = time.time()
if CHECK:
    checker.merge("./newData", query="socket", save=False)
    user_input = input("do you want to save the merge?").replace(" ", "")
    while user_input not in ["yes", "y", "no", "n"]:
        user_input = input("do you want to save the merge?").replace(" ", "")

# merge either because MERGE is configured, or because the user approved the
# dry-run merge above
if (MERGE and user_input == "") or user_input in ["yes", "y"]:
    print("running merge")
    name1 = checker.merge("./newData", query="socket", save=True)
    name2 = data_parser.main(name2_base_name, csvReader.readfile(name1),
                             OUTPUT_RESULTS_PATH)

if PARSE:
    print("parsing data")
    if name1 == "":
        name1 = csvReader.get_latest_name(name1_base_name)
    name2 = data_parser.main(name2_base_name, csvReader.readfile(name1),
                             OUTPUT_RESULTS_PATH)

if RENDER:
    print("rendering")
    if name1 == "":
        print("check1")
        name1 = csvReader.get_latest_name(name1_base_name)
    if name2 == "":
        name2 = csvReader.get_latest_name(name2_base_name)