Example #1
def show_histogram(dataset, labeling=LABELED_FILENAME, field=None):

    if labeling == ALL_FILENAME:
        labeled_filename = get_dataset_filename(dataset, LABELED_FILENAME,
                                                FILTERED_POSTFIX,
                                                JSON_FILE_EXTENSION)
        unlabeled_filename = get_dataset_filename(dataset, UNLABELED_FILENAME,
                                                  FILTERED_POSTFIX,
                                                  JSON_FILE_EXTENSION)
        data = load_json(labeled_filename) + load_json(unlabeled_filename)
    else:
        filename = get_dataset_filename(dataset, labeling, FILTERED_POSTFIX,
                                        JSON_FILE_EXTENSION)
        data = load_json(filename)

    if data is None:
        print("No data was selected")
        sys.exit()

    texts = get_texts(data, field)
    text_lengths = [len(text.split()) for text in texts]

    print("Mean, words:", np.mean(text_lengths))
    print("Median, words:", np.median(text_lengths))
    print("Standard deviation, words:", np.std(text_lengths))
    print("Minimum, words:", np.min(text_lengths))
    print("Maximum, words:", np.max(text_lengths))
    print("90th percentile", np.percentile(text_lengths, 90))

    need_upper_limit = input(
        "Would you like to put a constraint on the maximum text length displayed? (y/n) "
    ) == "y"
    if need_upper_limit:
        upper_limit = int(
            input("Please enter the upper text length limit (words): "))

    min_length = min(text_lengths)
    max_length = max(text_lengths)
    if need_upper_limit:
        max_length = min(upper_limit, max_length)

    suggested_number_of_bins = [
        i for i in range(5, 30) if (max_length - min_length) % i == 0
    ]
    print("Suggested number of bins is:", *suggested_number_of_bins)
    bins = int(input("Please input the number of bins: "))

    plt.figure(figsize=(12, 7))
    plt.hist(text_lengths, bins=bins, range=(min_length, max_length))
    step = (max_length - min_length) / bins
    plt.xticks(np.arange(min_length, max_length + 1, step))
    plt.xlim(min_length, max_length)
    plt.xlabel(get_x_label(field))
    plt.ylabel("Number of records")

    create_folder_if_needed(STATISTICS_FOLDER)
    filename = get_statistics_image_filename(dataset, TEXT_LENGTH_STAT)
    plt.savefig(filename, bbox_inches=PLOT_BBOX_INCHES)

    print("Text length histogram saved at %s" % filename)
Example #2
def train_gensim(dataset,
                 algorithm,
                 embedding_size,
                 minimum_count,
                 window_size,
                 iterations,
                 notes_filename,
                 data=None,
                 save=True,
                 workers=4):

    if data is None:
        labeled_filename = get_dataset_filename(dataset, LABELED_FILENAME,
                                                FILTERED_POSTFIX,
                                                JSON_FILE_EXTENSION)
        unlabeled_filename = get_dataset_filename(dataset, UNLABELED_FILENAME,
                                                  FILTERED_POSTFIX,
                                                  JSON_FILE_EXTENSION)

        labeled_data = load_json(labeled_filename)
        unlabeled_data = load_json(unlabeled_filename)

        # Combine both splits, treating a missing split as an empty list;
        # without parentheses the chained conditionals drop unlabeled_data.
        data = (labeled_data if labeled_data is not None else []) + (
            unlabeled_data if unlabeled_data is not None else [])

    training_sentences = []
    for datapoint in data:
        sentences = datapoint[SUMMARY_FIELD_KEY]
        if datapoint.get(DESCRIPTION_FIELD_KEY) is not None:
            sentences = sentences + datapoint.get(DESCRIPTION_FIELD_KEY)
        for sentence in sentences:
            training_sentences.append([word for word in sentence.split()])
    print("Sentences prepared")

    # Note: `size` and `iter` are the gensim 3.x parameter names
    # (vector_size and epochs in gensim 4+).
    model = Word2Vec(training_sentences,
                     min_count=minimum_count,
                     size=embedding_size,
                     window=window_size,
                     sg=1 if algorithm == "skip-gram" else 0,
                     compute_loss=True,
                     iter=iterations,
                     seed=7,
                     workers=workers)

    if notes_filename is not None:
        with open(notes_filename, "a") as notes_filename:
            print("Gensim model loss:",
                  model.get_latest_training_loss(),
                  file=notes_filename)

    if save:
        filename = get_dataset_filename(dataset, ALL_FILENAME, GENSIM_MODEL,
                                        PICKLE_FILE_EXTENSION)
        model.save(filename)
        print("Model saved at", filename)

    return model
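A self-contained way to try the Word2Vec call with toy sentences is sketched below; it uses the gensim 4.x parameter names (vector_size, epochs), whereas train_gensim above uses the pre-4.0 names (size, iter), and the sentences are invented for illustration.

# Standalone Word2Vec sketch with toy issue-tracker sentences (assumption);
# parameter names follow gensim 4.x (vector_size/epochs instead of size/iter).
from gensim.models import Word2Vec

training_sentences = [
    ["fix", "login", "bug"],
    ["update", "user", "documentation"],
    ["refactor", "database", "layer"],
    ["login", "fails", "after", "timeout"],
]

model = Word2Vec(training_sentences,
                 min_count=1,
                 vector_size=50,
                 window=2,
                 sg=1,                 # 1 = skip-gram, 0 = CBOW
                 compute_loss=True,
                 epochs=5,
                 seed=7,
                 workers=1)            # single worker keeps the seed effective

print("Gensim model loss:", model.get_latest_training_loss())
print("Vector for 'login':", model.wv["login"][:5])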
Example #3
def count_tokens(dataset, notes_filename, data=None, save=True):

    if data is None:

        labeled_data_filename = get_dataset_filename(dataset, LABELED_FILENAME,
                                                     FILTERED_POSTFIX,
                                                     JSON_FILE_EXTENSION)
        labeled_data = load_json(labeled_data_filename)

        unlabeled_data_filename = get_dataset_filename(dataset,
                                                       UNLABELED_FILENAME,
                                                       FILTERED_POSTFIX,
                                                       JSON_FILE_EXTENSION)
        unlabeled_data = load_json(unlabeled_data_filename)

        data = labeled_data if labeled_data is not None else []
        if unlabeled_data is not None:
            data = data + unlabeled_data

    print("Counting tokens...")

    token_counts = {}

    for datapoint in data:

        for text_key in [SUMMARY_FIELD_KEY, DESCRIPTION_FIELD_KEY]:
            if datapoint.get(text_key) is not None:
                words = merge_sentences(datapoint[text_key]).split()
                for word in words:
                    token_counts[word] = token_counts.get(word, 0) + 1

    print("Sorting...")
    token_counts = sorted(token_counts.items(),
                          key=lambda x: x[1],
                          reverse=True)

    if save:
        filename = get_dataset_filename(dataset, ALL_FILENAME,
                                        TOKEN_COUNT_POSTFIX,
                                        JSON_FILE_EXTENSION)
        save_json(filename, token_counts)
        print("Token counts and frequencies saved at %s" % filename)

    with open(notes_filename, "a") as notes_file:
        print("%d different unique tokens" % (len(token_counts)),
              file=notes_file)

    return token_counts
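The counting loop can also be expressed with collections.Counter; the sketch below does that over invented datapoints, with plain "summary"/"description" keys standing in for the project's field-key constants and sentence merging assumed to be a whitespace join.

# Token counting with collections.Counter over invented datapoints; the
# "summary"/"description" keys are stand-ins for the project's constants.
from collections import Counter

data = [
    {"summary": ["fix login bug", "add tests"],
     "description": ["login fails on timeout"]},
    {"summary": ["update docs"]},
]

token_counts = Counter()
for datapoint in data:
    for text_key in ("summary", "description"):
        for sentence in datapoint.get(text_key, []):
            token_counts.update(sentence.split())

# most_common() yields the same (token, count) pairs, sorted by frequency,
# that count_tokens returns above.
print(token_counts.most_common())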
Example #4
def show_histogram(dataset):

    filename = get_dataset_filename(dataset, LABELED_FILENAME,
                                    FILTERED_POSTFIX, JSON_FILE_EXTENSION)
    data = load_json(filename)

    if data is None:
        return

    project_issue_counts = get_issue_counts(data)
    issue_counts = [c[1] for c in project_issue_counts]

    for project, issue_count in project_issue_counts:
        print("%s - %d issues" % (project, issue_count))
    print("Number of projects:", len(issue_counts))

    min_size = min(issue_counts)
    max_size = max(issue_counts)

    need_upper_limit = input(
        "Would you like to put a constraint on the maximum project size displayed? (y/n) "
    ) == "y"
    if need_upper_limit:
        upper_limit = int(input("Please enter the upper project size limit: "))
        max_size = min(upper_limit, max_size)

    suggested_number_of_bins = [
        i for i in range(5, 30) if (max_size - min_size) % i == 0
    ]
    print("Suggested number of bins is:", *suggested_number_of_bins)
    bins = int(input("Please input the number of bins: "))

    plt.figure(figsize=(12, 7))
    plt.hist(issue_counts, bins=bins, range=(min_size, max_size))
    step = (max_size - min_size) / bins
    plt.xticks(np.arange(min_size, max_size + 1, step))
    plt.xlim(min_size, max_size)
    plt.xlabel("Number of labeled datapoints in project")
    plt.ylabel("Number of projects")

    create_folder_if_needed(STATISTICS_FOLDER)
    filename = get_statistics_image_filename(dataset, PROJECT_SIZE_STAT)
    plt.savefig(filename, bbox_inches=PLOT_BBOX_INCHES)

    print("Project size histogram saved at %s" % filename)
Example #5
def load_and_parse_data(datasets, labeling):

    data = []
    for dataset in datasets:
        filename = get_repository_filename(dataset, labeling, CLEANED_POSTFIX,
                                           JSON_FILE_EXTENSION)
        dataset_data = load_json(filename)

        if dataset_data is None:

            print("%s does not contain %s datapoints with cleaned text" %
                  (dataset,
                   "labeled" if labeling == LABELED_FILENAME else "unlabeled"))
            continue

        for dataset_datapoint in dataset_data:

            if dataset_datapoint.get(SUMMARY_FIELD_KEY) is None:
                continue

            training_datapoint = {
                ID_FIELD_KEY:
                int(dataset_datapoint[ID_FIELD_KEY]),
                PROJECT_FIELD_KEY:
                "%s-%s" % (dataset, dataset_datapoint[PROJECT_FIELD_KEY]),
                SUMMARY_FIELD_KEY:
                dataset_datapoint[SUMMARY_FIELD_KEY]
            }
            if DESCRIPTION_FIELD_KEY in dataset_datapoint:
                training_datapoint[DESCRIPTION_FIELD_KEY] = dataset_datapoint[
                    DESCRIPTION_FIELD_KEY]
            if TIMESPENT_FIELD_KEY in dataset_datapoint:
                training_datapoint[TIMESPENT_FIELD_KEY] = int(
                    dataset_datapoint[TIMESPENT_FIELD_KEY])
            if ALPHA_FIELD in dataset_datapoint:
                training_datapoint[ALPHA_FIELD] = dataset_datapoint[
                    ALPHA_FIELD]

            data.append(training_datapoint)

    if len(data) == 0:
        print("No %s data was selected" %
              ("labeled" if labeling == LABELED_FILENAME else "unlabeled"))
        return

    return data
Example #6
def load_and_arrange(dataset, split_percentage, split_fields, max_length, lookup, labeled_data=None):

    if labeled_data is None:
        data_filename = get_dataset_filename(dataset, LABELED_FILENAME, FILTERED_POSTFIX, JSON_FILE_EXTENSION)
        labeled_data = load_json(data_filename)

    shuffled_data = ordered_shuffle(labeled_data)
    del labeled_data

    if split_fields:
        x_strings_arr = [
            [merge_sentences(datapoint.get(SUMMARY_FIELD_KEY))
             for datapoint in shuffled_data],
            [merge_sentences(datapoint.get(DESCRIPTION_FIELD_KEY, []))
             for datapoint in shuffled_data],
        ]
    else:
        x_strings_arr = [[
            merge_sentences(datapoint.get(SUMMARY_FIELD_KEY) +
                            datapoint.get(DESCRIPTION_FIELD_KEY, []))
            for datapoint in shuffled_data
        ]]

    y = np.array([datapoint[TIMESPENT_FIELD_KEY] / SECONDS_IN_HOUR for datapoint in shuffled_data])
    del shuffled_data

    print("Converting data to numeric format and creating vector dictionary...")
    x = []
    string_dictionary = {}
    vector_dictionary = []

    for i, x_strings in enumerate(x_strings_arr):
        numeric_x_strings, string_dictionary, vector_dictionary = convert_to_numeric(
            x_strings,
            string_dictionary,
            vector_dictionary,
            lookup,
            max_length[i])
        numeric_padded_x = pad_sequences(numeric_x_strings, maxlen=max_length[i])
        x.append(numeric_padded_x)

    vector_dictionary.insert(0, [0] * len(vector_dictionary[0]))
    vector_dictionary = np.array(vector_dictionary)

    return split_train_test_val((x, y), split_percentage), vector_dictionary
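The numeric conversion and padding step can be seen in isolation in the sketch below; the toy word index replaces the project's convert_to_numeric and lookup table, and pad_sequences comes from Keras (the exact import path varies with the Keras/TensorFlow version).

# Sketch of the conversion-and-padding step with a toy word index replacing
# convert_to_numeric and the vector lookup (assumption).
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ["fix login bug", "update docs"]
word_index = {}
numeric_x_strings = []
for text in texts:
    ids = [word_index.setdefault(word, len(word_index) + 1)
           for word in text.split()]
    numeric_x_strings.append(ids)

# Index 0 is left free for padding, which mirrors the zero vector inserted
# at position 0 of vector_dictionary above.
x = pad_sequences(numeric_x_strings, maxlen=5)
print(x)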
Example #7
def show_histogram(dataset):

    filename = get_dataset_filename(dataset, LABELED_FILENAME,
                                    FILTERED_POSTFIX, JSON_FILE_EXTENSION)
    data = load_json(filename)

    if data is None:
        return

    y = [
        datapoint[TIMESPENT_FIELD_KEY] / SECONDS_IN_HOUR for datapoint in data
    ]
    print("Mean, hours:", np.mean(y))
    print("Median, hours:", np.median(y))
    print("Standard deviation, hours:", np.std(y))
    print("Minimum, hours:", np.min(y))
    print("Maximum, hours:", np.max(y))

    max_hours = int(
        input(
            "Please input the maximum number of hours to display in the histogram: "
        ))

    plt.figure(figsize=(12, 7))
    plt.hist(y,
             bins=max_hours * 12,
             range=(0, max_hours - 1 / SECONDS_IN_HOUR))
    plt.xticks(np.arange(0, max_hours + 1, 1))
    plt.xlim(0, max_hours)
    plt.xlabel("Time spent, hours")
    plt.ylabel("Number of tasks")

    create_folder_if_needed(STATISTICS_FOLDER)
    filename = get_statistics_image_filename(dataset, LABEL_DISTRIBUTION_STAT)
    plt.savefig(filename, bbox_inches=PLOT_BBOX_INCHES)

    print("Label distribution histogram saved at %s" % filename)
Example #8
def spacy_lookup(dataset, notes_filename, token_counts=None, save=True):

    if token_counts is None:
        token_count_filename = get_dataset_filename(dataset, ALL_FILENAME,
                                                    TOKEN_COUNT_POSTFIX,
                                                    JSON_FILE_EXTENSION)
        token_counts = load_json(token_count_filename)

    nlp = spacy.load('en_vectors_web_lg')

    print("Creating lookup table...")
    no_vector_count = 0
    lookup = {}
    for word in token_counts:

        doc = nlp(word[0])
        if not doc[0].has_vector:
            no_vector_count += 1
            continue

        lookup[word[0]] = doc[0].vector.tolist()

    with open(notes_filename, "a") as notes_file:
        print("%d (%.0f%%) of %d dictionary words had Spacy vectors" %
              get_part_strings(len(lookup),
                               len(lookup) + no_vector_count),
              file=notes_file)

    if save:
        print("Saving...")
        lookup_filename = get_dataset_filename(dataset, ALL_FILENAME,
                                               SPACY_LOOKUP_POSTFIX,
                                               JSON_FILE_EXTENSION)
        save_json(lookup_filename, lookup)
        print("Lookup table saved at", lookup_filename)

    return lookup
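A standalone version of the vector lookup is sketched below; it uses en_core_web_md, another model that ships word vectors, as a stand-in for the spaCy 2.x en_vectors_web_lg package, and the token list is invented.

# Standalone vector-lookup sketch; en_core_web_md stands in for
# en_vectors_web_lg and the tokens are invented examples.
import spacy

nlp = spacy.load("en_core_web_md")

tokens = ["database", "login", "qwxzt"]  # the last token is made up
lookup = {}
no_vector_count = 0
for token in tokens:
    doc = nlp(token)
    if not doc[0].has_vector:
        no_vector_count += 1
        continue
    lookup[token] = doc[0].vector.tolist()

print("%d of %d tokens had vectors" % (len(lookup), len(tokens)))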
Example #9
def load_dataset(dataset, labeling):

    filename = get_dataset_filename(dataset, labeling, MERGED_POSTFIX,
                                    JSON_FILE_EXTENSION)
    return load_json(filename)
Example #10
import os

from data_collection.fetch_data import fetch_data
from utilities.file_utils import load_json

from utilities.constants import DATA_FOLDER, DATA_COLLECTION_FOLDER

REPOSITORY_LIST_FILENAME = DATA_COLLECTION_FOLDER + "/known_repos.json"


def fetch_repositories(repositories):
    """Fetching data from a list of JIRA repositories"""

    if repositories is None:
        print("No JIRA repositories were found at", REPOSITORY_LIST_FILENAME)
        return

    for repository in repositories:
        if not os.path.exists("%s/%s" % (DATA_FOLDER, repository[0])):
            try:
                fetch_data(repository[0], repository[1])
            except Exception as e:
                print(
                    "Skipping %s because the following exception was thrown:" %
                    repository[1])
                print(e)
                continue


if __name__ == "__main__":

    repositories = load_json(REPOSITORY_LIST_FILENAME)
    fetch_repositories(repositories)
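The expected shape of known_repos.json is not shown here; judging from the indexing in fetch_repositories, each entry appears to be a two-element list, roughly as in the hypothetical sketch below (the repository names and URLs are invented examples).

# Hypothetical known_repos.json contents, inferred from how fetch_repositories
# indexes each entry: element 0 names the data folder and is the first
# argument to fetch_data, element 1 is the second argument (likely a JIRA
# base URL). In the script above the file is read from REPOSITORY_LIST_FILENAME.
import json

repositories = [
    ["APACHE", "https://issues.apache.org/jira"],   # invented example
    ["MONGODB", "https://jira.mongodb.org"],        # invented example
]

with open("known_repos.json", "w") as repo_file:
    json.dump(repositories, repo_file, indent=4)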