def outputResults(tfidf, score_range=(0.000004, 0.00007), prefix="ignore"):
    """Write one subreddit's TF-IDF scores that fall inside score_range to CSV.

    tfidf: a (subreddit_name, {word: score}) pair.
    score_range: (low, high) exclusive bounds a score must fall strictly
        between to be recorded.
    prefix: prepended to the output file name when longer than one character.
    """
    subredditname, scores = tfidf
    regionalisms = getRegionalisms()
    output = {"Subreddit": subredditname}
    words = set()
    # Iterating items() directly removes the redundant scores.get(word, 0)
    # lookup the original did while already walking scores' own keys.
    for word, score in scores.items():
        if score_range[0] < score < score_range[1]:
            words.add(word)
            output[word] = score
            if word in regionalisms and score > 0:
                print(subredditname + " has " + word + " with score " + str(score))
    # check if prefix should be included
    if len(prefix) > 1:
        datafilepath = datafolder + "/results/" + prefix + subredditname + "_results_tfidf.csv"
    else:
        datafilepath = datafolder + "/results/" + subredditname + "_results_tfidf.csv"
    # Mode "a+" creates the file if missing, so the original exists-check plus
    # open(..., "x") pre-create step was redundant and race-prone; drop it.
    with open(datafilepath, "a+", newline='') as csvfile:
        fieldnames = ["Subreddit"]
        fieldnames.extend(words)
        csvwriter = csv.DictWriter(csvfile, fieldnames, dialect='excel')
        # NOTE(review): a header row is written on every call, so repeated
        # appends to the same file repeat the header — confirm intended.
        csvwriter.writeheader()
        csvwriter.writerow(output)
def analyzeCountData(corpuslist, outfilename="results_counts", prefix=""):
    """Append one row of raw regionalism counts per corpus to a shared CSV.

    corpuslist: iterable of corpus (subreddit) names handed to recordCountData.
    outfilename: base name of the results CSV (without extension).
    prefix: optional file-name prefix.

    The header row is written only when the file did not exist beforehand.
    """
    if len(prefix) > 0:
        datafilepath = datadirectory + "/results/" + prefix + outfilename + ".csv"
    else:
        datafilepath = datadirectory + "/results/" + outfilename + ".csv"
    fieldnames = ["Subreddit"]
    for word in getRegionalisms():
        fieldnames.append(word)
    print(fieldnames)
    # Decide about the header before opening: mode "a" creates the file
    # itself, so the separate open(..., "x") pre-create was redundant.
    is_new_file = not path.exists(datafilepath)
    # "with" guarantees the file is closed even if recordCountData raises;
    # the original leaked the handle on any exception. It also removes the
    # duplicated open/DictWriter code from both branches.
    with open(datafilepath, "a", newline='') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames, restval=0, dialect='excel')
        if is_new_file:
            csvwriter.writeheader()
        for corpus in corpuslist:
            recordCountData(corpus, csvwriter)
def checkComment(comment):
    """Return True when any whitespace-separated token of *comment* is a
    known regionalism, False otherwise."""
    regionalisms = getRegionalisms()
    return any(token in regionalisms for token in comment.split())
def main(frequencyCSV="results_frequencys.csv"):
    # based off of https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
    # edited to suit my needs here
    """Compute a TF-IDF score per regionalism per subreddit from a frequency
    CSV and write the full matrix to results_tfidf.csv.

    frequencyCSV: file name (under the results directory) produced by the
        frequency-recording pass; one row per subreddit.
    """
    readRegionalisms()
    # NOTE(review): this reads from "/Results/" but the writer below uses
    # "/results/" — on a case-sensitive filesystem those differ; confirm.
    frequencyCSV_path = datafolder + "/Results/" + frequencyCSV
    regionalisms = getRegionalisms()
    idfscore = dict.fromkeys(regionalisms, 0)
    subreddit_data = []
    with open(frequencyCSV_path, "r") as frequencyCSV_file:
        csvreader = csv.DictReader(frequencyCSV_file, delimiter=",", dialect='excel')
        for row in csvreader:
            subreddit_data.append(row)
        print(csvreader.fieldnames)
    # should now have a thing of frequencies.
    # calc idf:
    N = len(subreddit_data)
    print("analyzing " + str(N) + " subreddits.")
    for word in regionalisms:
        if word == "Subreddit":
            continue
        idfscore[word] = calculateIDF(subreddit_data, word, N)
    output = []
    for current_freq in subreddit_data:
        print(current_freq.get("Subreddit"))
        tfidf_score_current = dict.fromkeys(regionalisms, 0)
        # Only score words present in both the regionalism list and this row.
        for word in set(regionalisms).intersection(current_freq.keys()):
            print(str(current_freq.get(word)) + " " + str(idfscore.get(word)))
            # frequency in doc * idf[word]
            tfidf_score_current[word] = float(current_freq.get(word)) * idfscore.get(word)
        tfidf_score_current["Subreddit"] = current_freq.get("Subreddit")
        output.append(tfidf_score_current)
    # write the data
    datafilepath = datafolder + "/results/results_tfidf.csv"
    # BUG FIX: the original open(datafilepath, "x") raised FileExistsError on
    # every rerun and the "a+" handle was never closed; "w" in a with-block
    # creates or truncates and always closes.
    with open(datafilepath, "w", newline='') as csvfile:
        fieldnames = ["Subreddit"]
        fieldnames.extend(regionalisms)
        csvwriter = csv.DictWriter(csvfile, fieldnames, dialect='excel')
        csvwriter.writeheader()
        for subdata in output:
            print(subdata)
            csvwriter.writerow(subdata)
def addStopWords(extrastopfile="../../data/supplementalremovedwords.txt"):
    """Merge supplemental stop words into the global *stopwords* set, then
    remove every regionalism (and every part of a multi-word regionalism)
    so regionalisms are never filtered out as stop words.

    extrastopfile: path to a whitespace-separated list of extra stop words.
    """
    global stopwords
    readRegionalisms()
    regionalisms = getRegionalisms()
    with open(extrastopfile, "r+") as extra:
        extrastopfile_text = extra.read()
    # BUG FIX: set.union() returns a NEW set and the original discarded that
    # result, so the supplemental words were never actually added. update()
    # mutates the global in place.
    stopwords.update(extrastopfile_text.split())
    # Avoid filtering out part of a regionalism if it's two words.
    # BUG FIX: the per-word union() result was also discarded; build a local
    # expanded set instead (also avoids mutating the shared regionalism set).
    expanded_regionalisms = set(regionalisms)
    for phrase in regionalisms:
        expanded_regionalisms.update(phrase.split())
    stopwords.difference_update(expanded_regionalisms)
def recordCountData(corpusname, csvwritter):
    """Sum word frequencies over every processed text file of *corpusname*
    and write a single CSV row of raw counts for each regionalism."""
    totalFQ = FreqDist()
    for textfile in getTextFileNames(corpusname):
        print("recording the file: " + textfile)
        if path.exists(textfile):
            totalFQ = collectFreqData(textfile) + totalFQ
    # The comment-separator sentinel is bookkeeping only; zero it out so it
    # cannot leak into the counts.
    totalFQ["<end_comment>"] = 0
    towrite = {"Subreddit": corpusname}
    for word in getRegionalisms():
        towrite[word] = totalFQ[word]
    csvwritter.writerow(towrite)
def main(prefix=""):
    """Build a shared vocabulary over the selected subreddits, compute
    per-subreddit TF-IDF scores, and record the regionalism columns.

    prefix: accepted for interface compatibility but currently unused —
        TODO(review): thread it through to the output file name or drop it.
    """
    readRegionalisms()
    # to select subreddits to process, add their name to the CurrentSubredditSet file
    # NOTE: actually returns a list.
    toAnalyze = readSubredditSet()
    print(toAnalyze)
    allwords = getRegionalisms().copy()
    for subreddit in toAnalyze:
        numprevwords = len(allwords)
        print("num unique words: " + str(numprevwords))
        allwords = allwords.union(getUniqueWords(subreddit))
        print(subreddit + " added " + str(len(allwords) - numprevwords) + " words.")
    print("in total, there are " + str(len(allwords)) + " words in the vector.")
    frequencylist = []
    for subreddit in toAnalyze:
        currentfreq = initalizeFreqDistWithKeys(allwords)
        print("num keys in frequency: " + str(len(currentfreq.keys())))
        for file in getTextFileNames(subreddit):
            currentfreq = collectFreqData(file) + currentfreq
        frequencylist.append(currentfreq)
        # BUG FIX: added the missing space before "unique" — the count used
        # to fuse with the word ("123unique words in ...").
        print("currently, there are " + str(len(currentfreq.keys())) + " unique words in " + subreddit)
    idf_scores = computeIDF(frequencylist, allwords)
    tfidf_scores = []
    for frequency in frequencylist:
        tfidf_scores.append(computeTFIDF(frequency, idf_scores, allwords))
    csv_keys = getRegionalismsOnlyCSVKeys()
    #csv_keys = getAllWordCSVKeys(allwords)
    # frequencylist was appended in toAnalyze order, so scores and subreddit
    # names pair up positionally.
    for score, subreddit in zip(tfidf_scores, toAnalyze):
        recordRegionalismTFIDF(score, subreddit, csv_keys)
def removestopwords(filename):
    """Filter stop words, URL tokens, and /r/subreddit links out of
    *filename*, writing surviving lines to a sibling file named
    "filtered_<name>". Regionalisms and the <end_comment> sentinel are kept
    verbatim (punctuation intact); other words are kept punctuation-stripped.
    """
    global stopwords
    if stopwords is None:
        # NOTE(review): when the global is None this line raises
        # AttributeError (None has no .words) — presumably nltk's stopwords
        # corpus was meant to be referenced here; confirm against imports.
        stopwords = set(stopwords.words('english'))
        addStopWords()
    nameinsert_index = filename.rfind("/")
    outpath = filename[:nameinsert_index+1] + "filtered_" + filename[nameinsert_index+1:]
    print("sending normalized values of " + filename + " to "+ outpath)
    # BUG FIX: both patterns used "(S)*" — a literal capital letter S — where
    # "\S*" (non-whitespace run) was intended, so they never matched real
    # URLs or subreddit links. The trailing "\s" is now optional too: tokens
    # from line.split() never contain whitespace, so requiring one meant the
    # subreddit pattern could never match at all.
    url_pattern = re.compile(r'\((https|http)?:\/\/\S*\)')
    subreddit_pattern = re.compile(r'\/(r|R)\/\S*\s*')
    # Loop-invariant translation table, hoisted out of the hot loop.
    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    punct_table = str.maketrans('', '', string.punctuation)
    with open(outpath, "a+", errors='ignore') as filtered, \
            open(filename, "r") as current_file:
        regionalisms = getRegionalisms()
        for line in current_file:
            if len(line) > 0 and not line.startswith("[deleted]"):  # hopefully not a new line
                tokens = line.split()
                numwords = 0
                current_linewrite = []
                for word in tokens:
                    # remove urls and /r/ or /R/ subreddit references
                    word = url_pattern.sub('', word)
                    word = subreddit_pattern.sub('', word)
                    if len(word) < 1:
                        continue
                    word_noPunctuation = word.translate(punct_table)
                    if word_noPunctuation not in stopwords:
                        if word in regionalisms or word == "<end_comment>":
                            # keep regionalisms/sentinel exactly as written
                            current_linewrite.append(word)
                        else:
                            current_linewrite.append(word_noPunctuation)
                        numwords = numwords + 1
                # drop lines reduced to a single word or less
                if numwords > 1:
                    filtered.write(" ".join(current_linewrite) + "\n")
def recordFrequencyData(corpusname, csvwritter, useLogFreq=False):
    """Sum word frequencies over every processed text file of *corpusname*
    and write one CSV row of relative (or log-relative) regionalism
    frequencies; words never seen get 0."""
    totalFQ = FreqDist()
    for textfile in getTextFileNames(corpusname):
        print("recording the file: " + textfile)
        if path.exists(textfile):
            totalFQ = collectFreqData(textfile) + totalFQ
    row = {"Subreddit": corpusname}
    for word in getRegionalisms():
        count = totalFQ[word]
        if count == 0:
            # avoid 0/N (and log(0)) — record an explicit zero
            row[word] = 0
        elif useLogFreq:
            row[word] = math.log(count / totalFQ.N())
        else:
            row[word] = count / totalFQ.N()
    csvwritter.writerow(row)
def getRegionalismsOnlyCSVKeys():
    """Return the CSV header fields: "Subreddit" followed by every
    regionalism, in the order getRegionalisms() yields them."""
    return ["Subreddit"] + list(getRegionalisms())