def build_years_tally(directory, year_list, yrange_min, yrange_max):
    """Count how many JSON volumes fall into each period.

    Walks ``directory`` recursively and loads every non-hidden file as JSON.
    The publication year is read from ``"Year Published"``, falling back to
    ``"Date"`` when that key is absent.  Volumes whose year lies in
    ``[yrange_min, yrange_max)`` are counted under the period key returned by
    ``common.determine_year(year, year_list)``.

    Args:
        directory: Root directory holding the JSON volumes.
        year_list: Period keys; every entry gets a slot in the result.
        yrange_min: Inclusive lower bound on the publication year.
        yrange_max: Exclusive upper bound on the publication year.

    Returns:
        dict mapping each entry of ``year_list`` to its number of volumes.

    Raises:
        KeyError: If a volume has neither ``"Year Published"`` nor ``"Date"``.
    """
    years_tally = {y: 0 for y in year_list}
    for subdir, _dirs, files in os.walk(directory):
        print("Counting number of volumes per period.")
        for jsondoc in tqdm.tqdm(files):
            # skip hidden files such as .DS_Store
            if jsondoc.startswith("."):
                continue
            # os.walk descends into sub-directories, so the file must be
            # opened relative to the directory currently being walked, not
            # relative to the root.
            with open(os.path.join(subdir, jsondoc), 'r',
                      encoding='utf-8') as in_file:
                jsondata = json.load(in_file)
            try:
                year = int(jsondata["Year Published"])
            except KeyError:
                year = int(jsondata["Date"])
            # check to make sure it's within range specified by user
            if yrange_min <= year < yrange_max:
                target = common.determine_year(year, year_list)
                # periods that are not part of year_list are ignored
                if target in years_tally:
                    years_tally[target] += 1
    return years_tally
def populate_overall_sentiment(directory, overall_list, year_list, afinn):
    """Compute the mean per-token sentiment of every volume, grouped by period.

    Walks ``directory`` recursively, loads each non-hidden file as JSON and
    scores every entry of its ``"Filtered Text"`` sequence with
    ``afinn.score``.  The sum of the scores divided by the number of entries
    is stored as a ``(filename, score)`` tuple under every keyword of
    ``overall_list`` for the period returned by ``common.determine_year``.

    NOTE(review): ``yrange_min`` and ``yrange_max`` are not parameters of this
    function; they are resolved from the enclosing module scope.

    Args:
        directory: Root directory holding the JSON volumes.
        overall_list: Keywords under which the scores are filed.
        year_list: Period keys used to build the result structure.
        afinn: Object exposing ``score(text)``.

    Returns:
        The structure built by ``common.build_dict_of_lists(year_list,
        overall_list)``, with a ``(filename, mean_score)`` tuple appended for
        each in-range volume.
    """
    overall_sent = common.build_dict_of_lists(year_list, overall_list)
    for subdir, _dirs, files in os.walk(directory):
        print("Calculating sentiment across entire corpus.")
        for jsondoc in tqdm.tqdm(files):
            # skip hidden files
            if jsondoc.startswith("."):
                continue
            # open relative to the directory being walked so that files in
            # nested folders resolve correctly
            with open(os.path.join(subdir, jsondoc), 'r',
                      encoding='utf8') as inpt:
                jsondata = json.load(inpt)
            text = jsondata["Filtered Text"]
            year = int(jsondata["Year Published"])
            # check to make sure it's within range specified by user
            if not yrange_min <= year < yrange_max:
                continue
            # determine which period it falls within
            target = common.determine_year(year, year_list)
            sentiment = sum(afinn.score(token) for token in text)
            # An empty token list has no mean; record it as neutral (0.0)
            # instead of dividing by zero.
            if text:
                truncated_sentiment = float(sentiment / len(text))
            else:
                truncated_sentiment = 0.0
            # even though overall_list only has one keyword, this looks
            # better than just hard-coding "all" within the method
            for keyword in overall_list:
                # append entry as tuple rather than just sentiment score
                # so sent_calcs can be used to get the average
                overall_sent[target][keyword].append(
                    (jsondoc, truncated_sentiment))
    return overall_sent
def init_sent_doc_dict(input_dir, key_list, year_list, stopwords, yrange_min,
                       yrange_max, text_type):
    """Collect the stop-word-filtered text of every volume per period/keyword.

    Each sub-directory found while walking ``input_dir`` is treated as a
    keyword.  Every non-hidden JSON file below it is loaded, the sequence
    stored under ``text_type`` is stripped of stop words and of entries
    shorter than two characters, and the remaining list is appended to
    ``doc_dict[period][keyword]`` when the volume's ``"Year Published"`` lies
    in ``[yrange_min, yrange_max)``.

    Args:
        input_dir: Directory whose sub-directories are named after keywords.
        key_list: Keywords used to build the result structure.
        year_list: Period keys used to build the result structure.
        stopwords: Container of words to drop (membership-tested with ``in``).
        yrange_min: Inclusive lower bound on the publication year.
        yrange_max: Exclusive upper bound on the publication year.
        text_type: JSON key under which the token sequence is stored.

    Returns:
        The structure built by ``common.build_dict_of_lists(year_list,
        key_list)`` with one filtered token list appended per matching volume.
    """
    doc_dict = common.build_dict_of_lists(year_list, key_list)
    for dirs, subdirs, _files in os.walk(input_dir):
        # 'subdir' corresponds to each keyword
        print("Building volumes dictionary.")
        for subdir in tqdm.tqdm(subdirs):
            keyword_dir = os.path.join(dirs, subdir)
            for folder, _subfolders, filenames in os.walk(keyword_dir):
                for jsondoc in filenames:
                    # skip hidden files
                    if jsondoc.startswith("."):
                        continue
                    # join with the folder actually being walked so files in
                    # nested folders are opened at their real location
                    with open(os.path.join(folder, jsondoc), 'r',
                              encoding='utf8') as inpt:
                        jsondata = json.load(inpt)
                    # remove stopwords, empty strings and single characters
                    text = [
                        word for word in jsondata[text_type]
                        if not (word in stopwords or len(word) < 2)
                    ]
                    year = int(jsondata["Year Published"])
                    # check to make sure it's within range specified by user
                    if not yrange_min <= year < yrange_max:
                        continue
                    target = common.determine_year(year, year_list)
                    try:
                        doc_dict[target][subdir].append(text)
                    except KeyError:
                        # period or directory name is not part of the
                        # requested periods/keywords
                        pass
    return doc_dict
def calculate_idf_results(keywords, year_list, years_tally, directory,
                          yrange_min, yrange_max):
    """Compute an inverse-document-frequency score per period and keyword.

    First counts, for every period, how many in-range volumes contain each
    keyword at least once.  A non-bigram keyword may bundle several words
    separated by ``"/"``; the volume counts if any of them occurs.  The
    counts are then converted in place to
    ``1 + round(log10(years_tally[period] / count), 4)``, or ``0`` when the
    keyword never occurs in the period.

    NOTE(review): ``text_type`` and ``bigrams`` are not parameters; they are
    resolved from the enclosing module scope.

    Args:
        keywords: Keywords to score.
        year_list: Period keys.
        years_tally: Mapping of period key to number of volumes in it.
        directory: Root directory holding the JSON volumes.
        yrange_min: Inclusive lower bound on the publication year.
        yrange_max: Exclusive upper bound on the publication year.

    Returns:
        The structure built by ``common.build_dict_of_nums(year_list,
        keywords)`` holding the IDF score for every period/keyword pair.
    """
    idf_results = common.build_dict_of_nums(year_list, keywords)
    for subdir, _dirs, files in os.walk(directory):
        print("Calculating IDF scores.")
        for jsondoc in tqdm.tqdm(files):
            # skip hidden files
            if jsondoc.startswith("."):
                continue
            # open relative to the directory being walked so that files in
            # nested folders resolve correctly
            with open(os.path.join(subdir, jsondoc), 'r',
                      encoding='utf8') as in_file:
                jsondata = json.load(in_file)
            text = jsondata[text_type]
            if bigrams:
                text = nltk.bigrams(text)
            try:
                year = int(jsondata["Year Published"])
            except KeyError:
                year = int(jsondata["Date"])
            # check to make sure it's within range specified by user
            if not yrange_min <= year < yrange_max:
                continue
            target = common.determine_year(year, year_list)
            # create word frequency distribution
            fdist = nltk.FreqDist(text)
            for keyword in keywords:
                # unigram keywords may group alternatives with "/";
                # in bigram mode the keyword's own elements are looked up
                terms = keyword if bigrams else keyword.split("/")
                # a volume counts once if any of the terms occurs in it
                if any(fdist[term] > 0 for term in terms):
                    try:
                        idf_results[target][keyword] += 1
                    except KeyError:
                        # period not tracked in idf_results
                        pass
    for y in year_list:
        for keyword in keywords:
            try:
                doc_count = idf_results[y][keyword]
                # Add 1 to the rounded logarithm so the idf is nonzero,
                # unless the word doesn't occur at all for the period, in
                # which case its idf score is 0.
                if doc_count > 0:
                    idf_results[y][keyword] = 1 + round(
                        math.log(years_tally[y] / doc_count, 10), 4)
                else:
                    idf_results[y][keyword] = 0
            except KeyError:
                pass
    return idf_results
def keyword_and_word_count(year_list, directory, yrange_min, yrange_max,
                           keywords):
    """Count keyword occurrences and total words per period.

    For every in-range volume a frequency distribution of its tokens (or of
    its bigrams when the module-level ``bigrams`` flag is set) is built.  For
    each keyword the occurrences of its ``"/"``-separated alternatives are
    added to ``word_count_dict[period][keyword][alternative]`` and their sum
    to ``word_count_dict[period][keyword]["TOTAL"]``; the per-volume sum is
    also appended to ``frequency_list[period][keyword]``.

    NOTE(review): ``text_type`` and ``bigrams`` are not parameters; they are
    resolved from the enclosing module scope.

    Args:
        year_list: Period keys.
        directory: Root directory holding the JSON volumes.
        yrange_min: Inclusive lower bound on the publication year.
        yrange_max: Exclusive upper bound on the publication year.
        keywords: Keywords to count.

    Returns:
        ``[word_count_dict, frequency_list, word_totals]``.
    """
    word_totals = common.build_simple_dict_of_nums(year_list)
    word_count_dict = common.build_nested_dict_of_nums(year_list, keywords)
    frequency_list = common.build_dict_of_lists(year_list, keywords)
    for subdir, _dirs, files in os.walk(directory):
        print("Taking word counts")
        for jsondoc in tqdm.tqdm(files):
            # skip hidden files
            if jsondoc.startswith("."):
                continue
            # open relative to the directory being walked so that files in
            # nested folders resolve correctly
            with open(os.path.join(subdir, jsondoc), 'r',
                      encoding='utf8') as in_file:
                jsondata = json.load(in_file)
            text = jsondata[text_type]
            if bigrams:
                # nltk.bigrams yields lazily; materialise it once so that
                # counting its length does not leave FreqDist below with an
                # already-exhausted iterator.
                text = list(nltk.bigrams(text))
            num_words = len(text)
            try:
                year = int(jsondata["Year Published"])
            except KeyError:
                year = int(jsondata["Date"])
            # check to make sure it's within range specified by user
            if not yrange_min <= year < yrange_max:
                continue
            target = common.determine_year(year, year_list)
            fdist = nltk.FreqDist(text)
            for keyword in keywords:
                word_count = 0
                # update keyword count for period/keyword pair
                if not bigrams:
                    for k in keyword.split("/"):
                        word_count += fdist[k]
                        word_count_dict[target][keyword][k] += fdist[k]
                else:
                    # TODO: implement same functionality above for bigrams
                    # TODO: pretty much everything for bigrams is not functional
                    for part in keyword:
                        word_count += fdist[part]
                try:
                    # NOTE(review): this adds the volume's length once per
                    # keyword, so word_totals scales with len(keywords) --
                    # confirm that downstream consumers expect this.
                    word_totals[target] += num_words
                    word_count_dict[target][keyword]["TOTAL"] += word_count
                    # per-volume count, for mean & variance of samples
                    frequency_list[target][keyword].append(word_count)
                except KeyError:
                    # decade out of range
                    pass

    return [word_count_dict, frequency_list, word_totals]
def calculate_tfidf_results(year_list, keywords, directory, idf_results,
                            yrange_min, yrange_max):
    """Compute a TF-IDF score for every in-range volume and keyword.

    For each volume the term frequencies of a keyword's components (its
    ``"/"``-separated alternatives, or its own elements in bigram mode) are
    summed via ``calculate_tf`` and combined with the period's IDF value via
    ``calculate_tfidf``.  The resulting ``(filename, score)`` tuples are
    collected per period/keyword and finally sorted by ascending score.

    NOTE(review): ``text_type`` and ``bigrams`` are not parameters; they are
    resolved from the enclosing module scope.

    Args:
        year_list: Period keys.
        keywords: Keywords to score.
        directory: Root directory holding the JSON volumes.
        idf_results: Mapping ``period -> keyword -> idf`` score.
        yrange_min: Inclusive lower bound on the publication year.
        yrange_max: Exclusive upper bound on the publication year.

    Returns:
        The structure built by ``common.build_dict_of_lists(year_list,
        keywords)`` where each list holds ``(filename, tf_idf)`` tuples
        sorted by score.
    """
    tf_idf_results = common.build_dict_of_lists(year_list, keywords)
    for subdir, _dirs, files in os.walk(directory):
        print("Calculating TF-IDF scores.")
        for jsondoc in tqdm.tqdm(files):
            # skip hidden files
            if jsondoc.startswith("."):
                continue
            # open relative to the directory being walked so that files in
            # nested folders resolve correctly
            with open(os.path.join(subdir, jsondoc), 'r',
                      encoding='utf8') as inpt:
                jsondata = json.load(inpt)
            text = jsondata[text_type]
            if bigrams:
                text = nltk.bigrams(text)
            try:
                year = int(jsondata["Year Published"])
            except KeyError:
                year = int(jsondata["Date"])
            # check to make sure it's within range specified by user
            if not yrange_min <= year < yrange_max:
                continue
            target = common.determine_year(year, year_list)
            # create word frequency distribution
            fdist = nltk.FreqDist(text)
            # calculate tf and tf-idf for each keyword
            for keyword in keywords:
                # if single-word keywords are being searched for, then
                # they can be grouped together, separated by a "/" character.
                terms = keyword if bigrams else keyword.split("/")
                temp = 0
                for term in terms:
                    temp += calculate_tf(fdist, term)
                try:
                    idf = idf_results[target][keyword]
                    tf_idf = calculate_tfidf(idf, temp)
                    # append tuple of document/tf-idf score pair
                    tf_idf_results[target][keyword].append((jsondoc, tf_idf))
                except KeyError:
                    # period/keyword not tracked
                    pass
    for year in year_list:
        for keyword in keywords:
            tf_idf_results[year][keyword] = sorted(
                tf_idf_results[year][keyword], key=lambda pair: pair[1])
    return tf_idf_results
# Example no. 7
# 0
def build_samples(csv_inpt, year_list, yrange_min, yrange_max):
    """Build per-period observation and sample-size totals from a CSV file.

    The header decides the mode: if its last column is ``"total words"`` the
    file stores word frequencies, otherwise it is treated as binary
    (occurrence flags).  For every data row whose year (second column) lies
    in ``[yrange_min, yrange_max)``:

    * binary mode: the period's sample size grows by one and, if any of the
      cells in ``row[2:-1]`` equals ``"1"``, its observation count by one;
    * frequency mode: the sample size grows by the last column and the
      observation count by the sum of the cells in ``row[2:-1]``.

    Args:
        csv_inpt: Path to the comma-separated input file.
        year_list: Period keys.
        yrange_min: Inclusive lower bound on the year.
        yrange_max: Exclusive upper bound on the year.

    Returns:
        ``[p, n]`` -- observations and sample sizes keyed by period.
    """
    # set up observation and sample size dicts
    p = common.build_simple_dict_of_nums(year_list)
    n = common.build_simple_dict_of_nums(year_list)
    # newline='' is what the csv module expects so that newlines embedded in
    # quoted fields are parsed correctly
    with open(csv_inpt, 'r', newline='') as csv_file:
        read_csv = csv.reader(csv_file, delimiter=',')
        row1 = next(read_csv, None)
        if row1 is None:
            # empty file: nothing to sample
            return [p, n]
        # this column is populated if the csv file stores word frequencies
        binary = not (row1 and row1[-1] == "total words")
        print("Building a set of samples")
        for row in tqdm.tqdm(read_csv):
            # csv.reader yields [] for blank lines; skip those as well as
            # repeated header rows
            if not row or row[0] == "filename":
                continue
            year = int(row[1])
            # check to make sure it's within range specified by user
            if not yrange_min <= year < yrange_max:
                continue
            # determine which period it falls within
            target = common.determine_year(year, year_list)
            try:
                if binary:
                    # one more volume to sample size w/r/t year period
                    n[target] += 1
                else:
                    # add total words to sample size w/r/t year period
                    n[target] += int(row[-1])
            except KeyError:
                pass
            # NOTE(review): the slice drops the last column in binary mode
            # too -- confirm binary files carry a trailing non-keyword column.
            for cell in row[2:-1]:
                if binary:
                    if cell == "1":
                        try:
                            # add one to observation dict and break
                            p[target] += 1
                            break
                        except KeyError:
                            pass
                else:
                    try:
                        # add frequency in this cell to observation dict
                        p[target] += int(cell)
                    except KeyError:
                        pass
    return [p, n]
def populate_sent_dict(directory, key_list, year_list, afinn):
    """Score the sentiment of every volume, grouped by period and keyword.

    Each sub-directory found while walking ``directory`` is treated as a
    keyword.  Every non-hidden JSON file below it is loaded, its ``"Text"``
    value is scored with ``afinn.score`` and the ``(filename, score)`` tuple
    is appended to ``sent_dict[period][keyword]``.  The populated structure
    is passed through ``sort_sent_dict`` before being returned.

    NOTE(review): ``yrange_min`` and ``yrange_max`` are not parameters of this
    function; they are resolved from the enclosing module scope.

    Args:
        directory: Directory whose sub-directories are named after keywords.
        key_list: Keywords used to build the result structure.
        year_list: Period keys used to build the result structure.
        afinn: Object exposing ``score(text)``.

    Returns:
        Whatever ``sort_sent_dict(year_list, key_list, sent_dict)`` returns.
    """
    sent_dict = common.build_dict_of_lists(year_list, key_list)
    for dirs, subdirs, _files in os.walk(directory):
        # 'subdir' corresponds to a keyword
        for subdir in subdirs:
            keyword_dir = os.path.join(dirs, subdir)
            for folder, _subfolders, filenames in os.walk(keyword_dir):
                for jsondoc in filenames:
                    # skip hidden files
                    if jsondoc.startswith("."):
                        continue
                    # join with the folder actually being walked so files in
                    # nested folders are opened at their real location
                    with open(os.path.join(folder, jsondoc), 'r',
                              encoding='utf8') as inpt:
                        jsondata = json.load(inpt)
                    text = jsondata["Text"]
                    year = int(jsondata["Year Published"])
                    # check to make sure it's within range specified by user
                    if not yrange_min <= year < yrange_max:
                        continue
                    target = common.determine_year(year, year_list)
                    sentiment = afinn.score(text)
                    sent_dict[target][subdir].append((jsondoc, sentiment))
    return sort_sent_dict(year_list, key_list, sent_dict)
def calculate_n_words(year_list, directory, num, yrange_min, yrange_max):
    """Determine the top ``num`` words (or bigrams) for every period.

    Builds one merged frequency distribution and one total text length per
    period from all in-range volumes, then asks ``obtain_n_words`` for the
    top entries.  When a period has fewer distinct entries than ``num``, all
    of them are requested.  Periods without any volume keep an empty list.

    NOTE(review): ``text_type`` and ``bigrams`` are not parameters; they are
    resolved from the enclosing module scope.

    Args:
        year_list: Period keys.
        directory: Root directory holding the JSON volumes.
        num: Number of top entries wanted per period.
        yrange_min: Inclusive lower bound on the publication year.
        yrange_max: Exclusive upper bound on the publication year.

    Returns:
        The structure built by ``common.build_simple_dict_of_lists(year_list)``
        extended with the output of ``obtain_n_words`` for each period.
    """
    fdist_dict = common.build_simple_dict_of_nums(year_list)
    text_lengths = common.build_simple_dict_of_nums(year_list)
    n_dict = common.build_simple_dict_of_lists(year_list)
    print("Calculating top {0} words per period".format(str(num)))
    for subdir, _dirs, files in os.walk(directory):
        for jsondoc in files:
            # skip hidden files
            if jsondoc.startswith("."):
                continue
            # open relative to the directory being walked so that files in
            # nested folders resolve correctly
            with open(os.path.join(subdir, jsondoc), 'r',
                      encoding='utf8') as in_file:
                jsondata = json.load(in_file)
            try:
                year = int(jsondata["Year Published"])
            except KeyError:
                year = int(jsondata["Date"])
            text = jsondata[text_type]
            if bigrams:
                # nltk.bigrams yields lazily and has no len(); materialise it
                # so it can be both measured and counted
                text = list(nltk.bigrams(text))
            text_len = len(text)
            # check to make sure it's within range specified by user
            if not yrange_min <= year < yrange_max:
                continue
            target = common.determine_year(year, year_list)
            fdist = nltk.FreqDist(text)
            text_lengths[target] += text_len
            if fdist_dict[target] == 0:
                # first volume of this period replaces the numeric placeholder
                fdist_dict[target] = fdist
            else:
                # NOTE(review): on Counter-based distributions `|=` keeps the
                # per-word maximum rather than the sum -- confirm intended.
                fdist_dict[target] |= fdist
    for year in year_list:
        period_fdist = fdist_dict[year]
        # a period without volumes still holds its numeric placeholder,
        # which has no len(); leave its result list empty
        if period_fdist == 0:
            continue
        top_n = min(num, len(period_fdist))
        n_dict[year].extend(
            obtain_n_words(period_fdist, top_n, text_lengths[year]))
    return n_dict