Example #1
def getUniqueWords(subredditname):
    wordfile_path = datadirectory + "/ProcessedData/" + subredditname + "_words" + ".txt"

    set_of_words = set()
    freq_subreddit = FreqDist()

    # build the vocabulary from scratch if it hasn't been cached yet
    if not path.exists(wordfile_path):
        for datafile in getTextFileNames(subredditname):
            if path.exists(datafile):
                print("reading " + datafile)
                freq_subreddit = collectFreqData(datafile) + freq_subreddit
            else:
                print("no data for " + datafile)

        #quick sanity check: show the 20 most common words
        for i in freq_subreddit.most_common(20):
            print(i)

        with open(wordfile_path, "a+") as wordfile:
            for word in freq_subreddit.keys():
                word = word.strip().lower()
                set_of_words.add(word)
                wordfile.write(word + "\n")
        return set_of_words
    else:
        with open(wordfile_path, "r") as wordfile:
            #read line by line
            print("reading " + wordfile_path)
            for word in wordfile:
                word = word.strip().lower()
                set_of_words.add(word)
            return set_of_words
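
A quick usage sketch (hypothetical subreddit name; assumes datadirectory and the imports above are set up as in the rest of the module): the first call builds the cache file under ProcessedData, and any later call just re-reads it.

# first call scans the corpus text files and writes AskAnAmerican_words.txt
vocab = getUniqueWords("AskAnAmerican")
# a repeat call takes the else branch and reads the cached word list back
assert getUniqueWords("AskAnAmerican") == vocab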
Example #2
def main():
    #initialize globals
    readRegionalisms()
    addStopWords()

    #to select subreddits to process, add their name to the CurrentSubredditSet file
    toProcess = readSubredditSet()
    for corpusname in toProcess:
        print("doing, "+corpusname)
        download("subreddit-"+corpusname, data_dir=datadirectory+"/DataDownloads")

        #create the processed-data directory
        processed_dir = datadirectory + "/ProcessedData/" + corpusname
        if not os.path.exists(processed_dir):
            os.makedirs(processed_dir)

        #rename the downloaded zip to <name>.corpus.zip (done once)
        downloaded_zip = datadirectory + "/DataDownloads/subreddit-" + corpusname + ".zip"
        corpus_zip = datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip"
        if os.path.exists(downloaded_zip) and not os.path.exists(corpus_zip):
            os.rename(downloaded_zip, corpus_zip)

        print(corpus_zip)
        with ZipFile(corpus_zip, mode="r") as corpuszip:
            if not os.path.exists(processed_dir + "/utterances.jsonl"):
                corpuszip.extract("utterances.jsonl", path=processed_dir + "/")


        #make the unfiltered text files
        old_data_exists = any(
            os.path.exists(file)
            for file in getTextFileNames(corpusname, filtered=False))
        if not old_data_exists:
            convertToText(corpusname)
        else:
            print(corpusname + " has already been converted to unfiltered text files, moving on")

        # remove stopwords
        old_data_exists = any(
            os.path.exists(file) for file in getTextFileNames(corpusname))

        if not old_data_exists:
            removeStopwordsFromConverted(corpusname)
        else:
            print(corpusname + " has already had its text files filtered")
Example #3
def recordAudienceData(corpusname, csvwriter):
    towrite = [corpusname]
    for file in getTextFileNames(corpusname, filtered=True):
        print("recording the file: " + file)
        if path.exists(file):
            comment_with, numcomments = collectAudienceFreqData(file)
            print("num comments: " + str(numcomments))
            print("num with mention: " + str(comment_with))
            #guard against an empty file so the ratio can't divide by zero
            freq = round(comment_with / numcomments, 5) if numcomments else 0
            towrite.append(freq)
        else:
            print("no such file: " + file)
            towrite.append(0)
    csvwriter.writerow(towrite)
Example #4
def recordCountData(corpusname, csvwriter):
    totalFQ = FreqDist()
    processed_corpus_texts = getTextFileNames(corpusname)

    for file in processed_corpus_texts:
        print("recording the file: " + file)
        if path.exists(file):
            freqs = collectFreqData(file)
            totalFQ = freqs + totalFQ

    totalFQ["<end_comment>"] = 0

    towrite = dict()
    towrite["Subreddit"] = corpusname

    for word in getRegionalisms():
        towrite[word] = totalFQ[word]

    csvwriter.writerow(towrite)
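
recordCountData (like the other record* functions that build a dict) expects a csv.DictWriter whose fieldnames cover "Subreddit" plus every regionalism. A minimal setup sketch (the results filename is hypothetical):

import csv

with open(datadirectory + "/results/results_counts.csv", "a", newline='') as csvfile:
    fieldnames = ["Subreddit"] + list(getRegionalisms())
    csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect='excel')
    if csvfile.tell() == 0:
        csvwriter.writeheader()  #fresh file: emit the header row once
    recordCountData("AskAnAmerican", csvwriter)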
Example #5
def main(prefix=""):
    readRegionalisms()
    # to select subreddits to process, add their name to the CurrentSubredditSet file
    #NOTE: actually returns a list.
    toAnalyze = readSubredditSet()
    print(toAnalyze)

    allwords = getRegionalisms().copy()
    for subreddit in toAnalyze:
        numprevwords = len(allwords)
        print("num unique words: " + str(numprevwords))
        allwords = allwords.union(getUniqueWords(subreddit))
        print(subreddit + " added " + str(len(allwords) - numprevwords) +
              " words.")

    print("in total, there are " + str(len(allwords)) +
          " words in the vector.")

    frequencylist = list()
    for subreddit in toAnalyze:
        currentfreq = initalizeFreqDistWithKeys(allwords)
        print("num keys in frequency: " + str(len(currentfreq.keys())))
        for file in getTextFileNames(subreddit):
            if path.exists(file):
                currentfreq = collectFreqData(file) + currentfreq
        frequencylist.append(currentfreq)

        print("currently, there are " + str(len(currentfreq.keys())) +
              "unique words in " + subreddit)

    idf_scores = computeIDF(frequencylist, allwords)

    tfidf_scores = list()

    #frequencylist is built in toAnalyze order, so index i below pairs each
    #score vector with its subreddit
    for frequency in frequencylist:
        tfidf_scores.append(computeTFIDF(frequency, idf_scores, allwords))

    csv_keys = getRegionalismsOnlyCSVKeys()
    #csv_keys = getAllWordCSVKeys(allwords)

    for i in range(len(toAnalyze)):
        recordRegionalismTFIDF(tfidf_scores[i], toAnalyze[i], csv_keys)
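
computeIDF and computeTFIDF are defined elsewhere in the project. As a reference point, here is what computeIDF might look like under the textbook smoothed-IDF definition (a sketch only; the project's actual formula may differ):

import math

def computeIDFSketch(freqdistlist, vocabulary):
    #idf(w) = log(N / (1 + df(w))), where df(w) is the number of subreddit
    #FreqDists containing w; the +1 avoids division by zero
    num_docs = len(freqdistlist)
    idf = dict()
    for word in vocabulary:
        doc_count = sum(1 for fd in freqdistlist if fd[word] > 0)
        idf[word] = math.log(num_docs / (1 + doc_count))
    return idf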
Example #6
def recordFrequencyData(corpusname, csvwriter, useLogFreq=False):
    totalFQ = FreqDist()
    processed_corpus_texts = getTextFileNames(corpusname)

    for file in processed_corpus_texts:
        print("recording the file: " + file)
        if path.exists(file):
            freqs = collectFreqData(file)
            totalFQ = freqs + totalFQ

    towrite = dict()
    towrite["Subreddit"] = corpusname

    for word in getRegionalisms():
        if totalFQ[word] == 0:
            towrite[word] = 0
        elif useLogFreq:
            #log of the relative frequency (negative, since the ratio is < 1)
            towrite[word] = math.log(totalFQ[word] / totalFQ.N())
        else:
            towrite[word] = totalFQ[word] / totalFQ.N()
    csvwriter.writerow(towrite)
Example #7
def unigramFreqFile(subreddit):
    # get filtered files
    filenames = getTextFileNames(subreddit)
    countFileName = getCountFileName(subreddit)
    with open(countFileName, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countFileName)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for word in line.split():
                        word = word.strip()
                        if word.startswith("http") or word.isnumeric():
                            continue
                        if 0 < len(word) < 23:
                            frequencies[word] += 1

        #zero out the comment-boundary marker so it isn't counted as a word
        frequencies["<end_comment>"] = 0
        #first line: total token count, then one "word count" pair per line
        countVectorFile.write(str(frequencies.N()) + "\n")
        for word in frequencies:
            countVectorFile.write(word + " " + str(frequencies[word]) + "\n")
Example #8
def bigramFreqFile(subreddit):
    #get filtered files
    filenames = getTextFileNames(subreddit)
    countfilename = getCountFileName(subreddit, unigram=False)
    with open(countfilename, "a+", errors='ignore') as countVectorFile:
        frequencies = FreqDist()

        #good candidate for multithreading: one thread per file, each with its
        #own freq dist, combined after all finish (see the sketch after this
        #function)
        for filename in filenames:
            print("sending normalized values of " + filename + " to " + countfilename)
            with open(filename, "r", errors="ignore") as current_file:
                for line in current_file:
                    for bigram in bigrams(line.split()):
                        okayrange = 0 < len(bigram[0]) < 23 and 0 < len(bigram[1]) < 23
                        #skip bigrams touching the comment-boundary marker in
                        #either position, so pairs never span two comments
                        if okayrange and "<end_comment>" not in bigram:
                            frequencies[bigram] += 1

        #first line: total token count (same layout as the unigram file)
        countVectorFile.write(str(frequencies.N()) + "\n")

        #note: another possible improvement is to write these in sorted order
        #for faster lookup later
        for bigram in frequencies:
            countVectorFile.write(" ".join(bigram)+" "+str(frequencies[bigram]))
Example #9
def analyzeAudienceData(corpuslist, outfilename="results_audience", prefix=""):
    if len(prefix) > 0:
        datafilepath = datadirectory + "/results/" + prefix + outfilename + ".csv"
    else:
        datafilepath = datadirectory + "/results/" + outfilename + ".csv"
    #"a" creates the file if needed, so we only need to remember whether the
    #header row still has to be written
    write_header = not path.exists(datafilepath)
    with open(datafilepath, "a", newline='') as csvfile:
        csvwriter = csv.writer(csvfile, dialect='excel')
        if write_header:
            # see getUnfilteredTextFilename for the column order
            csvwriter.writerow([
                "Subreddit", "Post", "Post with Mention", "Comment",
                "Comment with Mention"
            ])
        for corpus_name in corpuslist:
            recordAudienceData(corpus_name, csvwriter)
Example #10
def removeStopwordsFromConverted(corpusname):
    #run the stopword filter over every unfiltered text file that exists
    for file in getTextFileNames(corpusname, filtered=False):
        if path.exists(file):
            removestopwords(file)
Example #11
def recordStatsData(corpusname, csvwriter):
    totalFQ = FreqDist()
    processed_corpus_texts = getTextFileNames(corpusname, filtered=False)

    #file order (see getTextFileNames): post no-mention, post mention,
    #comment no-mention, comment mention
    freqdists = [None, None, None, None]
    numcomments = [0, 0, 0, 0]
    for i, textfile in enumerate(processed_corpus_texts):
        if path.exists(textfile):
            print("reading: " + textfile)
            freqdists[i] = collectFreqData(textfile)
            totalFQ = totalFQ + freqdists[i]
            _, numcomments[i] = collectAudienceFreqData(textfile)

    print("writing")

    #key spellings below (including "Utterences" and the stray spaces) are
    #kept as-is because they must match the CSV header defined elsewhere
    towrite = dict()
    towrite["Subreddit"] = corpusname
    towrite["N"] = totalFQ.N()  #total token count
    towrite["B"] = totalFQ.B()  #number of distinct tokens (bins)
    towrite["Num Utterences"] = sum(numcomments)
    towrite["Num Utterences - Post NM"] = numcomments[0]
    towrite["Num Utterences - Post M"] = numcomments[1]
    towrite["Num Utterences - Comment"] = numcomments[2]
    towrite["Num Utterences - Comment M"] = numcomments[3]

    #per-file token (N) and vocabulary (B) sizes, zero for missing files
    nb_keys = [("N-Post", "B-Post"),
               ("N-Post with Mention", "B-Post with Mention"),
               ("N -Comment", "B -Comment"),
               ("N -Comment with Mention", "B -Comment with Mention")]
    for i, (nkey, bkey) in enumerate(nb_keys):
        if freqdists[i] is not None:
            towrite[nkey] = freqdists[i].N()
            towrite[bkey] = freqdists[i].B()
        else:
            towrite[nkey] = 0
            towrite[bkey] = 0
    csvwriter.writerow(towrite)