def check_output(name):
    """Debug helper: dump the first row of *name*, its "yaml" field and that
    field's type, then the first row whose "yaml" column is the string "False".
    """
    rows = csvReader.readfile(name)
    first = rows[0]
    print(first)
    print(first["yaml"])
    print(type(first["yaml"]))
    # CSV values are strings, so the flag is compared against "False" literally.
    flagged = next((row for row in rows if row.get("yaml") == "False"), None)
    if flagged is not None:
        print(flagged)
def spread_of_data_line_star_other_paper():
    """Graph the percentage of repositories that use CI per star count, using
    the other paper's breadth_corpus.csv sample as the data source."""
    rows = csvReader.readfile("breadth_corpus.csv")
    # Pair every repository's star count with a 0/1 CI flag; a row counts as
    # CI-using only when its "CI" column is a non-empty value greater than 0.
    results = [
        (int(row.get("stars")), 1 if row.get("CI") and int(row.get("CI")) > 0 else 0)
        for row in rows
    ]
    return create_percentage_bar_graphs(results, "percentage of stars that use CI", "stars")
def check(filename):
    """Print a one-line summary of a CSV file (column count, row count, rows
    with a readme, rows with a jenkins pipeline) and return its row count.

    Returns 0 immediately when *filename* is None.
    """
    if filename is None:
        return 0
    rows = csvReader.readfile(filename)
    keys_length = len(rows[0].keys()) if rows else 0
    readme = sum(1 for row in rows if row.get("readme") is not None)
    jenkins = sum(1 for row in rows if row.get("jenkinsPipeline0"))
    print("filename: {}{}{}{}{}".format(
        filename, pad(keys_length, 20), pad(len(rows), 20), pad(readme), pad(jenkins)))
    return len(rows)
def merge(mypath, save=True, query="raptor"):
    """Merge every CSV in *mypath* whose filename contains *query* into a
    single collection de-duplicated by repository id (later files win).

    Rows missing any of csvReader.KEYS_TO_TRY get an empty-string default,
    and a missing "watchers" value defaults to 0.  Duplicate ids and fork
    repositories are counted and reported on stdout.

    Returns "<name>.csv" when save is True, otherwise None.
    """
    onlyfiles = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and f.endswith(".csv") and query in f
    ]
    combined = {}
    duplicates = 0
    forks = 0
    # Iterate filenames directly instead of the range(len(...)) anti-pattern.
    for filename in onlyfiles:
        for line in csvReader.readfile(join(mypath, filename)):
            # Backfill missing config columns so every row has the same shape.
            for k in csvReader.KEYS_TO_TRY:
                if line.get(k) is None:
                    line[k] = ""
            # watchers not working in the scrape; default missing values to 0.
            if line.get("watchers") is None:
                line["watchers"] = 0
            repo_id = line.get("id")
            # Count how many ids we have already seen (later rows overwrite).
            if combined.get(repo_id) is not None:
                duplicates += 1
            # isinstance(..., bool) already excludes None, so no extra check.
            fork = line.get("fork")
            if isinstance(fork, bool) and fork:
                forks += 1
            combined[repo_id] = line
    print("duplicates: ", duplicates)
    print("forks: ", forks)
    print("results: ", len(combined.values()))
    if save:
        name = csvReader.check_name("combined")
        if name:
            csvReader.writeToCsv(list(combined.values()), name, fields=FIELDS)
        else:
            print("too many combined copies already found")
        # NOTE(review): when check_name returns "" this still returns ".csv";
        # preserved for caller compatibility, but it looks unintended — confirm.
        return f"{name}.csv"
    return None
def main(experimenting, name1, name2, image_encoding, output="."):
    """Render the study's figures and tables.

    experimenting -- True runs only the ad-hoc CI-usage-over-time analysis;
                     False regenerates the full set of publication outputs.
    name1 -- CSV filename of the scraped repository sample.
    name2 -- CSV filename of the parsed configuration-file data.
    image_encoding -- image format forwarded to save_as_pdf.
    output -- directory prefix for every generated file.

    Returns the DataFrame loaded from name2 in either branch.
    """
    if experimenting:
        # NOTE(review): a long series of one-off, commented-out experiments
        # lived here (spread/language/config/line-usage plots, language
        # tables, scatter plots, script/star correlations); elided as dead
        # commented-out code — recover from version control if needed.
        sorted_data = load_dataframe(name2)
        data = pd.read_csv("2020 combined.csv", dtype={"id": int, "language": str},
                           parse_dates=["recent_commit1", "recent_commit2", "recent_commit3"])
        # Per-language runs (Rust, JavaScript, Go, Python, C++, Java) and the
        # commit-recency percentage printouts were executed ad hoc and then
        # commented out once the numbers were captured.
        get_last_years_ci_usage(sorted_data, data, "lang", "language")
        # (2019 RQ3 language scatter was also produced here, now disabled.)
    else:
        data = csvReader.readfile(name1)
        save_as_pdf(spread_over_time_stars(data), f"{output}/spread over time", image_encoding)
        save_as_pdf(spread_data_issues_vs_stars(data), f"{output}/issues vs stars", image_encoding)
        sorted_data = load_dataframe(name2)
        yaml_config_errors_to_latex(
            f"{output}/yaml config errors.tex", sorted_data)
        # should not need to rerun this unless more scraping is done!!!
        # commented out as manual edits to the formatting are easier than code ones atm
        # config_type_split(f"{output}/configuration type count.tex", sorted_data)
        save_as_pdf(config_topn(sorted_data, 20), f"{output}/config-topn", image_encoding)
        # RQ3
        langs = languages_table_topn(
            f"{output}/languages table.tex", 30,
            pd.read_csv(name1, dtype=dtypes,
                        parse_dates=["recent_commit1", "recent_commit2", "recent_commit3"]),
            sorted_data)
        save_as_pdf(popularity_vs_percentage_CI_scatter(langs, sorted_data),
                    f"{output}/languages-scatter-CI", image_encoding)
        save_as_pdf(langues_topn(sorted_data, 30), f"{output}/languages-topn", image_encoding)
        save_as_pdf(language_type(pd.read_csv(name1, dtype=dtypes, parse_dates=[
            "recent_commit1", "recent_commit2", "recent_commit3"]), sorted_data),
            f"{output}/languages", image_encoding)
        # --------------
        # The line plots below consume the raw CSV rows, not the DataFrame.
        sorted_data_csv = csvReader.readfile(name2)
        save_as_pdf(spread_of_data_line_star(data, sorted_data_csv),
                    f"{output}/percentage stars with CI", image_encoding)
        save_as_pdf(spread_of_data_line_sub(data, sorted_data_csv),
                    f"{output}/percentage sub with CI", image_encoding)
        save_as_pdf(spread_of_data_line_star_other_paper(),
                    f"{output}/percentage sub with CI other paper source", image_encoding)
        # (three render_sankey_diagram.save_sanky_daigram_for_errors_and_comments
        #  calls are disabled here — sankey rendering of errors/comments.)
        save_as_pdf(line_usage_configuration(sorted_data),
                    f"{output}/line structure all", image_encoding)
        save_as_pdf(line_usage_configuration(sorted_data[sorted_data["yaml"]]),
                    f"{output}/line structure yaml", image_encoding)
        save_as_pdf(line_usage_configuration2(sorted_data[sorted_data["yaml"]]),
                    f"{output}/line structure yaml comments", image_encoding)
        save_as_pdf(line_usage_configuration2(sorted_data[sorted_data["yaml"] == False]),
                    f"{output}/line structure none yaml comments", image_encoding)
        save_as_pdf(comment_usage(sorted_data), f"{output}/comments usage bars", image_encoding)
        save_as_pdf(script_usage(sorted_data), f"{output}/scripts usage bars", image_encoding)
        scripts_latex(f"{output}/scripts table new.tex", sorted_data)
        save_as_pdf(lines_against_scripts(sorted_data), f"{output}/scripts vs lines", image_encoding)
        save_as_pdf(stars_against_lines(sorted_data), f"{output}/scripts vs stars", image_encoding)
    return sorted_data
    # NOTE(review): this is the tail of a writer routine whose `def` line is
    # outside this view; `data`, `num_worker_threads` and `output_for_latex`
    # are bound in the unseen part — confirm against the full file.
    name = csvReader.check_name(name, limit=20)
    if name == "":
        print(
            "file already found for the files for the main file so can't write to disk"
        )
        return
    print(f"writing to {name}.csv")
    # Append mode: the header is written once here, then run_main's worker
    # threads presumably append the data rows — TODO confirm.
    with open(f"{name}.csv", "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDS, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        run_main(num_worker_threads, data, name)
        write_to_latex(f"{output_for_latex}/generated_table2.tex", len(data), name)
        return f"{name}.csv"


if __name__ == '__main__':
    main("2020 yaml threaded", csvReader.readfile("2020 combined.csv"), "./results")
    # NOTE(review): `results`, `results_ci` and `no_readme` look like module
    # globals filled in during the run — verify they exist in the full file.
    print(len(results))
    print(len(results_ci))
    print(no_readme)
    # check_output("yaml threaded5.csv")
name2 = "" user_input = "" print("config check:{} merge:{} parse:{} render:{}".format( CHECK, MERGE, PARSE, RENDER)) s = time.time() if CHECK: checker.merge("./newData", query="socket", save=False) user_input = input("do you want to save the merge?").replace(" ", "") while user_input not in ["yes", "y", "no", "n"]: user_input = input("do you want to save the merge?").replace(" ", "") if (MERGE and user_input == "") or user_input in ["yes", "y"]: print("running merge") name1 = checker.merge("./newData", query="socket", save=True) name2 = data_parser.main(name2_base_name, csvReader.readfile(name1), OUTPUT_RESULTS_PATH) if PARSE: print("parsing data") if name1 == "": name1 = csvReader.get_latest_name(name1_base_name) name2 = data_parser.main(name2_base_name, csvReader.readfile(name1), OUTPUT_RESULTS_PATH) if RENDER: print("rendering") if name1 == "": print("check1") name1 = csvReader.get_latest_name(name1_base_name) if name2 == "":