def show_histogram(dataset, labeling=LABELED_FILENAME, field=None):
    """Print text length statistics and save a text length histogram."""

    if labeling == ALL_FILENAME:
        labeled_filename = get_dataset_filename(dataset, LABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        unlabeled_filename = get_dataset_filename(dataset, UNLABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        data = load_json(labeled_filename) + load_json(unlabeled_filename)
    else:
        filename = get_dataset_filename(dataset, labeling, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        data = load_json(filename)

    if data is None:
        print("No data was selected")
        sys.exit()

    texts = get_texts(data, field)
    text_lengths = [len(text.split()) for text in texts]

    print("Mean, words:", np.mean(text_lengths))
    print("Median, words:", np.median(text_lengths))
    print("Standard deviation, words:", np.std(text_lengths))
    print("Minimum, words:", np.min(text_lengths))
    print("Maximum, words:", np.max(text_lengths))
    print("90th percentile, words:", np.percentile(text_lengths, 90))

    need_upper_limit = input(
        "Would you like to put a constraint on the maximum text length displayed? (y/n) "
    ) == "y"
    if need_upper_limit:
        upper_limit = int(
            input("Please enter the upper text length limit (words): "))

    min_length = min(text_lengths)
    max_length = max(text_lengths)
    if need_upper_limit:
        max_length = min(upper_limit, max_length)

    suggested_number_of_bins = [
        i for i in range(5, 30) if (max_length - min_length) % i == 0
    ]
    print("Suggested number of bins is:", *suggested_number_of_bins)
    bins = int(input("Please input the number of bins: "))

    plt.figure(figsize=(12, 7))
    plt.hist(text_lengths, bins=bins, range=(min_length, max_length))
    step = (max_length - min_length) / bins
    plt.xticks(np.arange(min_length, max_length + 1, step))
    plt.xlim(min_length, max_length)
    plt.xlabel(get_x_label(field))
    plt.ylabel("Number of records")

    create_folder_if_needed(STATISTICS_FOLDER)
    filename = get_statistics_image_filename(dataset, TEXT_LENGTH_STAT)
    plt.savefig(filename, bbox_inches=PLOT_BBOX_INCHES)
    print("Text length histogram saved at %s" % filename)
def train_gensim(dataset,
                 algorithm,
                 embedding_size,
                 minimum_count,
                 window_size,
                 iterations,
                 notes_filename,
                 data=None,
                 save=True,
                 workers=4):

    if data is None:
        labeled_filename = get_dataset_filename(dataset, LABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        unlabeled_filename = get_dataset_filename(dataset, UNLABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        labeled_data = load_json(labeled_filename)
        unlabeled_data = load_json(unlabeled_filename)
        # Parentheses are required here; without them the conditional expression
        # binds the "+" and unlabeled data is silently dropped whenever labeled
        # data is present.
        data = (labeled_data if labeled_data is not None else []) \
            + (unlabeled_data if unlabeled_data is not None else [])

    training_sentences = []
    for datapoint in data:
        sentences = datapoint[SUMMARY_FIELD_KEY]
        if datapoint.get(DESCRIPTION_FIELD_KEY) is not None:
            sentences = sentences + datapoint.get(DESCRIPTION_FIELD_KEY)
        for sentence in sentences:
            training_sentences.append(sentence.split())
    print("Sentences prepared")

    model = Word2Vec(training_sentences,
                     min_count=minimum_count,
                     size=embedding_size,
                     window=window_size,
                     sg=1 if algorithm == "skip-gram" else 0,
                     compute_loss=True,
                     iter=iterations,
                     seed=7,
                     workers=workers)

    if notes_filename is not None:
        with open(notes_filename, "a") as notes_file:
            print("Gensim model loss:", model.get_latest_training_loss(), file=notes_file)

    if save:
        filename = get_dataset_filename(dataset, ALL_FILENAME, GENSIM_MODEL, PICKLE_FILE_EXTENSION)
        model.save(filename)
        print("Model saved at", filename)

    return model
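# Usage sketch (an assumption, not part of the original pipeline): with the
# gensim 3.x API used above, the trained model can be queried directly. The
# dataset name "jira", the hyperparameters, and the query token below are
# illustrative only.
#
#   model = train_gensim("jira", "skip-gram", embedding_size=200,
#                        minimum_count=5, window_size=5, iterations=10,
#                        notes_filename=None, save=False)
#   print(model.wv.most_similar("bug", topn=5))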
def count_tokens(dataset, notes_filename, data=None, save=True):

    if data is None:
        labeled_data_filename = get_dataset_filename(dataset, LABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        labeled_data = load_json(labeled_data_filename)
        unlabeled_data_filename = get_dataset_filename(dataset, UNLABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        unlabeled_data = load_json(unlabeled_data_filename)
        # Guard against a missing labeled file so the concatenation below cannot fail.
        data = labeled_data if labeled_data is not None else []
        if unlabeled_data is not None:
            data = data + unlabeled_data

    print("Counting tokens...")
    token_counts = {}
    for datapoint in data:
        for text_key in [SUMMARY_FIELD_KEY, DESCRIPTION_FIELD_KEY]:
            if datapoint.get(text_key) is not None:
                words = merge_sentences(datapoint[text_key]).split()
                for word in words:
                    token_counts[word] = token_counts.get(word, 0) + 1

    print("Sorting...")
    token_counts = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

    if save:
        filename = get_dataset_filename(dataset, ALL_FILENAME, TOKEN_COUNT_POSTFIX, JSON_FILE_EXTENSION)
        save_json(filename, token_counts)
        print("Token counts and frequencies saved at %s" % filename)

    with open(notes_filename, "a") as notes_file:
        print("%d unique tokens" % (len(token_counts)), file=notes_file)

    return token_counts
def show_histogram(dataset):
    """Print per-project issue counts and save a project size histogram."""

    filename = get_dataset_filename(dataset, LABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
    data = load_json(filename)
    if data is None:
        return

    project_issue_counts = get_issue_counts(data)
    issue_counts = [c[1] for c in project_issue_counts]
    for project, issue_count in project_issue_counts:
        print("%s - %d issues" % (project, issue_count))
    print("Number of projects:", len(issue_counts))

    min_size = min(issue_counts)
    max_size = max(issue_counts)
    need_upper_limit = input(
        "Would you like to put a constraint on the maximum project size displayed? (y/n) "
    ) == "y"
    if need_upper_limit:
        upper_limit = int(input("Please enter the upper project size limit: "))
        max_size = min(upper_limit, max_size)

    suggested_number_of_bins = [
        i for i in range(5, 30) if (max_size - min_size) % i == 0
    ]
    print("Suggested number of bins is:", *suggested_number_of_bins)
    bins = int(input("Please input the number of bins: "))

    plt.figure(figsize=(12, 7))
    plt.hist(issue_counts, bins=bins, range=(min_size, max_size))
    step = (max_size - min_size) / bins
    plt.xticks(np.arange(min_size, max_size + 1, step))
    plt.xlim(min_size, max_size)
    plt.xlabel("Number of labeled datapoints in project")
    plt.ylabel("Number of projects")

    create_folder_if_needed(STATISTICS_FOLDER)
    filename = get_statistics_image_filename(dataset, PROJECT_SIZE_STAT)
    plt.savefig(filename, bbox_inches=PLOT_BBOX_INCHES)
    print("Project size histogram saved at %s" % filename)
def load_and_parse_data(datasets, labeling):

    data = []
    for dataset in datasets:
        filename = get_repository_filename(dataset, labeling, CLEANED_POSTFIX, JSON_FILE_EXTENSION)
        dataset_data = load_json(filename)
        if dataset_data is None:
            print("%s does not contain %s datapoints with cleaned text" %
                  (dataset, "labeled" if labeling == LABELED_FILENAME else "unlabeled"))
            continue

        for dataset_datapoint in dataset_data:
            if dataset_datapoint.get(SUMMARY_FIELD_KEY) is None:
                continue
            training_datapoint = {
                ID_FIELD_KEY: int(dataset_datapoint[ID_FIELD_KEY]),
                PROJECT_FIELD_KEY: "%s-%s" % (dataset, dataset_datapoint[PROJECT_FIELD_KEY]),
                SUMMARY_FIELD_KEY: dataset_datapoint[SUMMARY_FIELD_KEY]
            }
            if DESCRIPTION_FIELD_KEY in dataset_datapoint:
                training_datapoint[DESCRIPTION_FIELD_KEY] = dataset_datapoint[DESCRIPTION_FIELD_KEY]
            if TIMESPENT_FIELD_KEY in dataset_datapoint:
                training_datapoint[TIMESPENT_FIELD_KEY] = int(dataset_datapoint[TIMESPENT_FIELD_KEY])
            if ALPHA_FIELD in dataset_datapoint:
                training_datapoint[ALPHA_FIELD] = dataset_datapoint[ALPHA_FIELD]
            data.append(training_datapoint)

    if len(data) == 0:
        print("No %s data was selected" %
              ("labeled" if labeling == LABELED_FILENAME else "unlabeled"))
        return

    return data
def load_and_arrange(dataset, split_percentage, split_fields, max_length, lookup, labeled_data=None):

    if labeled_data is None:
        data_filename = get_dataset_filename(dataset, LABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        labeled_data = load_json(data_filename)

    shuffled_data = ordered_shuffle(labeled_data)
    del labeled_data

    if split_fields:
        x_strings_arr = []
        x_strings_arr.append([
            merge_sentences(datapoint.get(SUMMARY_FIELD_KEY))
            for datapoint in shuffled_data
        ])
        x_strings_arr.append([
            merge_sentences(datapoint.get(DESCRIPTION_FIELD_KEY, []))
            for datapoint in shuffled_data
        ])
    else:
        x_strings_arr = [[
            merge_sentences(
                datapoint.get(SUMMARY_FIELD_KEY) + datapoint.get(DESCRIPTION_FIELD_KEY, []))
            for datapoint in shuffled_data
        ]]

    y = np.array([
        datapoint[TIMESPENT_FIELD_KEY] / SECONDS_IN_HOUR
        for datapoint in shuffled_data
    ])
    del shuffled_data

    print("Converting data to numeric format and creating vector dictionary...")
    x = []
    string_dictionary = {}
    vector_dictionary = []
    for i, x_strings in enumerate(x_strings_arr):
        numeric_x_strings, string_dictionary, vector_dictionary = convert_to_numeric(
            x_strings, string_dictionary, vector_dictionary, lookup, max_length[i])
        numeric_padded_x = pad_sequences(numeric_x_strings, maxlen=max_length[i])
        x.append(numeric_padded_x)

    # Reserve index 0 for padding by prepending a zero vector of matching size.
    vector_dictionary.insert(0, [0] * len(vector_dictionary[0]))
    vector_dictionary = np.array(vector_dictionary)

    return split_train_test_val((x, y), split_percentage), vector_dictionary
def show_histogram(dataset):
    """Print label statistics and save a time-spent distribution histogram."""

    filename = get_dataset_filename(dataset, LABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
    data = load_json(filename)
    if data is None:
        return

    y = [
        datapoint[TIMESPENT_FIELD_KEY] / SECONDS_IN_HOUR for datapoint in data
    ]
    print("Mean, hours:", np.mean(y))
    print("Median, hours:", np.median(y))
    print("Standard deviation, hours:", np.std(y))
    print("Minimum, hours:", np.min(y))
    print("Maximum, hours:", np.max(y))

    max_hours = int(
        input("Please input the maximum number of hours to display in the histogram: "))

    plt.figure(figsize=(12, 7))
    plt.hist(y, bins=max_hours * 12, range=(0, max_hours - 1 / SECONDS_IN_HOUR))
    plt.xticks(np.arange(0, max_hours + 1, 1))
    plt.xlim(0, max_hours)
    plt.xlabel("Time spent, hours")
    plt.ylabel("Number of tasks")

    create_folder_if_needed(STATISTICS_FOLDER)
    filename = get_statistics_image_filename(dataset, LABEL_DISTRIBUTION_STAT)
    plt.savefig(filename, bbox_inches=PLOT_BBOX_INCHES)
    print("Label distribution histogram saved at %s" % filename)
def spacy_lookup(dataset, notes_filename, token_counts=None, save=True):

    if token_counts is None:
        token_count_filename = get_dataset_filename(dataset, ALL_FILENAME, TOKEN_COUNT_POSTFIX, JSON_FILE_EXTENSION)
        token_counts = load_json(token_count_filename)

    nlp = spacy.load('en_vectors_web_lg')

    print("Creating lookup table...")
    no_vector_count = 0
    lookup = {}
    for word in token_counts:
        doc = nlp(word[0])
        if not doc[0].has_vector:
            no_vector_count += 1
            continue
        lookup[word[0]] = doc[0].vector.tolist()

    with open(notes_filename, "a") as notes_file:
        print("%d (%.0f%%) of %d dictionary words had Spacy vectors" %
              get_part_strings(len(lookup), len(lookup) + no_vector_count),
              file=notes_file)

    if save:
        print("Saving...")
        lookup_filename = get_dataset_filename(dataset, ALL_FILENAME, SPACY_LOOKUP_POSTFIX, JSON_FILE_EXTENSION)
        save_json(lookup_filename, lookup)
        print("Lookup table saved at", lookup_filename)

    return lookup
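# Usage sketch (hypothetical; the dataset name and notes path are illustrative,
# and it assumes count_tokens from above is importable here). spacy_lookup
# consumes the [token, count] pairs produced by count_tokens and returns a dict
# mapping each token with a spaCy vector to that vector:
#
#   counts = count_tokens("jira", "notes.txt", save=False)
#   lookup = spacy_lookup("jira", "notes.txt", token_counts=counts, save=False)
#   print(len(lookup), "tokens with vectors")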
def load_dataset(dataset, labeling):
    filename = get_dataset_filename(dataset, labeling, MERGED_POSTFIX, JSON_FILE_EXTENSION)
    return load_json(filename)
import os

from data_collection.fetch_data import fetch_data
from utilities.file_utils import load_json
from utilities.constants import DATA_FOLDER, DATA_COLLECTION_FOLDER

REPOSITORY_LIST_FILENAME = DATA_COLLECTION_FOLDER + "/known_repos.json"


def fetch_repositories(repositories):
    """Fetch data from a list of JIRA repositories"""

    if repositories is None:
        print("No JIRA repositories were found at", REPOSITORY_LIST_FILENAME)
        return

    for repository in repositories:
        if not os.path.exists("%s/%s" % (DATA_FOLDER, repository[0])):
            try:
                fetch_data(repository[0], repository[1])
            except Exception as e:
                print("Skipping %s because the following exception was thrown:" % repository[1])
                print(e)
                continue


if __name__ == "__main__":

    repositories = load_json(REPOSITORY_LIST_FILENAME)
    fetch_repositories(repositories)
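# Expected shape of known_repos.json, inferred from the indexing above
# (repository[0] is used as the data folder name and repository[1] is passed
# to fetch_data and printed when a repository is skipped); the entry below is
# illustrative only:
#
#   [
#       ["example_repo", "https://jira.example.org"]
#   ]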