示例#1
0
def outputResults(tfidf, score_range=(0.000004, 0.00007), prefix="ignore"):
    """Write one subreddit's in-range TF-IDF scores to its own CSV file.

    tfidf: a (subredditname, scores) pair where scores maps word -> tf-idf.
    score_range: (low, high) exclusive bounds; only words whose score falls
        strictly inside the range become CSV columns.
    prefix: optional filename prefix, included whenever non-empty.
    """
    subredditname, scores = tfidf
    regionalisms = getRegionalisms()

    output = {"Subreddit": subredditname}

    # A list (not a set) keeps the CSV column order deterministic:
    # it follows the iteration order of `scores`.
    words = []
    for word in scores:
        if score_range[0] < scores.get(word, 0) < score_range[1]:
            words.append(word)
            output[word] = scores[word]

        # Report any regionalism with a nonzero score, even outside the range.
        if word in regionalisms and scores[word] > 0:
            print(subredditname + " has " + word + " with score " +
                  str(scores[word]))

    # Include the prefix whenever one was supplied.
    # BUG FIX: was `len(prefix) > 1`, which silently dropped one-character
    # prefixes (the sibling analyzeCountData correctly uses `> 0`).
    if len(prefix) > 0:
        datafilepath = datafolder + "/results/" + prefix + subredditname + "_results_tfidf.csv"
    else:
        datafilepath = datafolder + "/results/" + subredditname + "_results_tfidf.csv"

    # "a+" creates the file if missing, so the separate "x" open was redundant.
    with open(datafilepath, "a+", newline='') as csvfile:
        fieldnames = ["Subreddit"]
        fieldnames.extend(words)
        csvwriter = csv.DictWriter(csvfile, fieldnames, dialect='excel')
        csvwriter.writeheader()
        csvwriter.writerow(output)
示例#2
0
def analyzeCountData(corpuslist, outfilename="results_counts", prefix=""):
    """Append raw regionalism counts for each corpus to a shared CSV.

    corpuslist: iterable of corpus (subreddit) names to record.
    outfilename: base name of the output CSV (without extension).
    prefix: optional filename prefix.
    """
    if len(prefix) > 0:
        datafilepath = datadirectory + "/results/" + prefix + outfilename + ".csv"
    else:
        datafilepath = datadirectory + "/results/" + outfilename + ".csv"

    fieldnames = ["Subreddit"]
    for word in getRegionalisms():
        fieldnames.append(word)
    print(fieldnames)

    # The header is written only the first time the file is created.
    is_new_file = not path.exists(datafilepath)
    if is_new_file:
        # create an empty file
        open(datafilepath, "x").close()

    # `with` guarantees the file is closed even if recordCountData raises
    # (the original could leak the handle on an exception), and the single
    # DictWriter construction removes the duplicated if/else branches.
    with open(datafilepath, "a", newline='') as csvfile:
        csvwriter = csv.DictWriter(csvfile,
                                   fieldnames,
                                   restval=0,
                                   dialect='excel')
        if is_new_file:
            csvwriter.writeheader()

        for corpus in corpuslist:
            recordCountData(corpus, csvwriter)
示例#3
0
def checkComment(comment):
    """Return True if any whitespace-separated token of `comment` is a
    known regionalism, else False.
    """
    regionalisms = getRegionalisms()
    # any() short-circuits on the first hit, exactly like the original
    # break-on-first-match flag loop.
    return any(word in regionalisms for word in comment.split())
def main(frequencyCSV="results_frequencys.csv"):
    """Compute TF-IDF scores for regionalisms from per-subreddit frequencies.

    Reads word frequencies per subreddit from `frequencyCSV`, computes an IDF
    score per regionalism, multiplies frequency * IDF for each subreddit, and
    writes the result rows to results_tfidf.csv.

    based off of https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
    edited to suit my needs here
    """
    readRegionalisms()
    frequencyCSV_path = datafolder + "/Results/" + frequencyCSV
    regionalisms = getRegionalisms()
    idfscore = dict.fromkeys(regionalisms, 0)
    subreddit_data = list()
    with open(frequencyCSV_path, "r") as frequencyCSV_file:
        csvreader = csv.DictReader(frequencyCSV_file,
                                   delimiter=",",
                                   dialect='excel')
        for row in csvreader:
            subreddit_data.append(row)
        print(csvreader.fieldnames)

    # should now have a thing of frequencies.
    # calc idf:
    N = len(subreddit_data)
    print("analyzing " + str(N) + " subreddits.")
    for word in regionalisms:
        # "Subreddit" is the label column, not a word.
        if word == "Subreddit":
            continue
        idfscore[word] = calculateIDF(subreddit_data, word, N)

    output = list()
    for current_freq in subreddit_data:
        print(current_freq.get("Subreddit"))
        tfidf_score_current = dict.fromkeys(regionalisms, 0)
        # Only words present in both the regionalism set and this row.
        for word in set(regionalisms).intersection(current_freq.keys()):
            print(str(current_freq.get(word)) + " " + str(idfscore.get(word)))
            # tf-idf = frequency in doc * idf[word]
            tfidf_score_current[word] = float(
                current_freq.get(word)) * idfscore.get(word)

        tfidf_score_current["Subreddit"] = current_freq.get("Subreddit")
        output.append(tfidf_score_current)

    # Write the data. "x" mode intentionally fails if the results file
    # already exists (same as the original create-then-append behavior),
    # and `with` closes the handle — the original never closed csvfile.
    datafilepath = datafolder + "/results/results_tfidf.csv"
    with open(datafilepath, "x", newline='') as csvfile:
        fieldnames = ["Subreddit"]
        fieldnames.extend(regionalisms)
        csvwriter = csv.DictWriter(csvfile, fieldnames, dialect='excel')
        csvwriter.writeheader()

        for subdata in output:
            print(subdata)
            csvwriter.writerow(subdata)
示例#5
0
def addStopWords(extrastopfile="../../data/supplementalremovedwords.txt"):
    """Extend the global stop-word set from a supplemental file, then remove
    every regionalism (and each word of multi-word regionalisms) so they are
    never filtered out.

    extrastopfile: path to a whitespace-separated file of extra stop words.
    """
    global stopwords
    readRegionalisms()
    # Copy so the project's regionalism collection is not mutated here.
    regionalisms = set(getRegionalisms())

    with open(extrastopfile, "r") as extra:
        extrastopfile_text = extra.read()

    # BUG FIX: set.union returns a NEW set; the original discarded the
    # result, so the supplemental words were never actually added.
    stopwords |= set(extrastopfile_text.split())

    # Avoid filtering out part of a regionalism if it's two words.
    # (The original's `regionalisms.union(...)` also discarded its result.)
    component_words = set()
    for word in regionalisms:
        component_words.update(word.split())
    regionalisms |= component_words

    stopwords.difference_update(regionalisms)
示例#6
0
def recordCountData(corpusname, csvwritter):
    """Accumulate word frequencies across a corpus's text files and write
    its regionalism counts as a single CSV row.
    """
    combined = FreqDist()
    for textfile in getTextFileNames(corpusname):
        print("recording the file: " + textfile)
        if path.exists(textfile):
            combined = collectFreqData(textfile) + combined

    # The comment-boundary sentinel must never be counted as a word.
    combined["<end_comment>"] = 0

    row = dict()
    row["Subreddit"] = corpusname
    for word in getRegionalisms():
        row[word] = combined[word]

    csvwritter.writerow(row)
示例#7
0
def main(prefix=""):
    """Build the word vector over the configured subreddit set, compute
    TF-IDF per subreddit, and record the regionalism columns for each.
    """
    readRegionalisms()
    # to select subreddits to process, add their name to the CurrentSubredditSet file
    # NOTE: actually returns a list.
    toAnalyze = readSubredditSet()
    print(toAnalyze)

    # Vocabulary = regionalisms plus every unique word from any subreddit.
    allwords = getRegionalisms().copy()
    for subreddit in toAnalyze:
        previous_count = len(allwords)
        print("num unique words: " + str(previous_count))
        allwords = allwords.union(getUniqueWords(subreddit))
        print(subreddit + " added " + str(len(allwords) - previous_count) +
              " words.")

    print("in total, there are " + str(len(allwords)) +
          " words in the vector.")

    # One frequency distribution per subreddit, in toAnalyze order.
    frequenceylist = list()
    for subreddit in toAnalyze:
        currentfreq = initalizeFreqDistWithKeys(allwords)
        print("num keys in frequency: " + str(len(currentfreq.keys())))
        for file in getTextFileNames(subreddit):
            currentfreq = collectFreqData(file) + currentfreq
        frequenceylist.append(currentfreq)

        print("currently, there are " + str(len(currentfreq.keys())) +
              "unique words in " + subreddit)

    idf_scores = computeIDF(frequenceylist, allwords)

    # frequenceylist is parallel to toAnalyze, so index i ties scores to names.
    tfidf_scores = [computeTFIDF(frequency, idf_scores, allwords)
                    for frequency in frequenceylist]

    csv_keys = getRegionalismsOnlyCSVKeys()
    #csv_keys = getAllWordCSVKeys(allwords)

    for i, subreddit in enumerate(toAnalyze):
        recordRegionalismTFIDF(tfidf_scores[i], subreddit, csv_keys)
示例#8
0
def removestopwords(filename):
    """Filter stop words, URLs, and punctuation out of `filename`, writing
    surviving lines to a sibling file named filtered_<original name>.

    Regionalisms and the <end_comment> marker are kept verbatim (including
    punctuation) so downstream counting still finds them. Lines that reduce
    to a single word or start with "[deleted]" are dropped.
    """
    global stopwords
    if stopwords is None:
        # NOTE(review): calling stopwords.words('english') when the global is
        # None cannot work — this presumably meant nltk.corpus.stopwords;
        # confirm against the module's imports.
        stopwords = set(stopwords.words('english'))
        addStopWords()

    nameinsert_index = filename.rfind("/")
    outpath = filename[:nameinsert_index+1] + "filtered_" + filename[nameinsert_index+1:]
    print("sending normalized values of " + filename + " to " + outpath)

    # `with` closes both files even if an exception is raised mid-loop
    # (the original only closed `filtered` on the success path).
    with open(outpath, "a+", errors='ignore') as filtered, \
         open(filename, "r") as current_file:
        regionalisms = getRegionalisms()
        for line in current_file:
            if len(line) > 0 and not line.startswith("[deleted]"):
                ##hopefully not a new line
                numwords = 0
                current_linewrite = list()
                for word in line.split():

                    # Remove markdown links "(http://...)" and /r/ references.
                    # BUG FIX: the original patterns used (S)* which matches
                    # literal 'S' characters; \S* (non-whitespace) was intended.
                    word = re.sub(r'\((https|http)?:\/\/\S*\)', '', word)
                    word = re.sub(r'\/(r|R)\/\S*\s', '', word)

                    if len(word) < 1:
                        continue

                    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
                    word_noPunctuation = word.translate(str.maketrans('', '', string.punctuation))
                    if word_noPunctuation not in stopwords:
                        if word in regionalisms or word == "<end_comment>":
                            # Keep regionalisms untouched, punctuation and all.
                            current_linewrite.append(word)
                        else:
                            current_linewrite.append(word_noPunctuation)
                        numwords = numwords + 1
                if numwords > 1:
                    filtered.write(" ".join(current_linewrite) + "\n")
示例#9
0
def recordFrequencyData(corpusname, csvwritter, useLogFreq=False):
    """Write one CSV row of relative regionalism frequencies for a corpus.

    useLogFreq: when True, record math.log(relative frequency) instead of
    the raw ratio. Words with a zero count are always recorded as 0.
    """
    combined = FreqDist()
    for textfile in getTextFileNames(corpusname):
        print("recording the file: " + textfile)
        if path.exists(textfile):
            combined = collectFreqData(textfile) + combined

    row = dict()
    row["Subreddit"] = corpusname

    for word in getRegionalisms():
        count = combined[word]
        if count == 0:
            # Zero count is written as exactly 0 (also avoids log(0)).
            row[word] = 0
        else:
            relative = count / combined.N()
            row[word] = math.log(relative) if useLogFreq else relative

    csvwritter.writerow(row)
示例#10
0
def getRegionalismsOnlyCSVKeys():
    """Return the CSV header row: "Subreddit" followed by every regionalism."""
    return ["Subreddit", *getRegionalisms()]