def filterBazaakParshaReadTFIDF(parshaName, lang='heb', min_count=MIN_WORD_COUNT, splitParshiot=None, min_distance=MIN_DISTANCE):
    """Run a Bazaak read on one parsha, keeping only words whose TF-IDF
    score is in the top PERCENT% for that parsha.

    :param parshaName: key of the parsha within splitParshiot
    :param lang: language of the split parshiot ('heb' by default)
    :param min_count: minimum repetition count passed through to BazaakRead
    :param splitParshiot: optional pre-built split-parshiot mapping; built
        via Parshiot.createSplitParshiot(lang) when not supplied
    :param min_distance: maximum word distance passed through to BazaakRead
    :return: dict of BazaakRead results restricted to the top-TF-IDF words
    """
    if not splitParshiot:
        splitParshiot = Parshiot.createSplitParshiot(lang)
    topTFIDF = TFIDF.parshaIDF(parshaName, splitParshiot)
    totalWords = len(topTFIDF)
    # find the fraction of words to keep
    percent = PERCENT / 100
    # Keep only the top PERCENT% highest-scoring words. Using a set here
    # makes each membership test in the filter below O(1); the original
    # list made the final filter O(words * kept) overall.
    topWords = {word for word, _ in topTFIDF.most_common(int(totalWords * percent))}
    parsha = splitParshiot[parshaName]
    read = BazaakRead(parsha, min_count, min_distance)
    # keep only results whose key survived the TF-IDF cut
    return {k: v for k, v in read.items() if k in topWords}
def freqBazaakParshaRead(parshaName, freqParshiot=None, min_count=MIN_WORD_COUNT, min_distance=MIN_DISTANCE):
    """Bazaak read of a single parsha using the frequency-processed text.

    Builds the frequency parshiot via Parshiot.processParshiotByFrequency()
    when none is supplied, then delegates to BazaakRead.
    """
    if not freqParshiot:
        freqParshiot = Parshiot.processParshiotByFrequency()
    return BazaakRead(freqParshiot[parshaName], min_count, min_distance)
def BazaakParshaRead(parshaName, lang='heb', min_count=MIN_WORD_COUNT, splitParshiot=None, min_distance=MIN_DISTANCE):
    """Bazaak read of a single parsha from the split-parshiot text.

    Builds the split parshiot via Parshiot.createSplitParshiot(lang) when
    none is supplied, then delegates to BazaakRead.
    """
    if not splitParshiot:
        splitParshiot = Parshiot.createSplitParshiot(lang)
    return BazaakRead(splitParshiot[parshaName], min_count, min_distance)
def BazaakAll(lang='heb', min_count=5, min_distance=80, filtered=False, strippedDown=True):
    """Run a Bazaak read over every parsha and collect the results.

    :param lang: language key used when building split parshiot
    :param min_count: minimum repetition count for BazaakRead
    :param min_distance: maximum word distance for BazaakRead
    :param filtered: when True, restrict each parsha's result to its top
        TF-IDF words via filterBazaakParshaReadTFIDF
    :param strippedDown: when True, use the frequency-processed parshiot
        instead of the plain split parshiot
        (NOTE(review): the frequency mapping is then passed where the
        per-parsha readers expect split parshiot — confirm this is intended)
    :return: dict mapping each parsha name to its Bazaak read result
    """
    if strippedDown:
        parshiot = Parshiot.processParshiotByFrequency()
    else:
        parshiot = Parshiot.createSplitParshiot(lang)
    # choose the per-parsha reader once, then apply it uniformly
    if filtered:
        reader = BazaakRead.filterBazaakParshaReadTFIDF
    else:
        reader = BazaakRead.BazaakParshaRead
    return {parsha: reader(parsha, lang, min_count, parshiot, min_distance)
            for parsha in parshaNames}
def TFIDFFreq(singleText, freqTextcollection, freqSingleText):
    """TF-IDF where both factors come from the frequency-converted text
    (each word reduced to its 2-letter rarest-frequency form).

    TF is taken per full word from the frequency text; IDF is looked up
    for each word's frequency-word form across the frequency collection.
    (Local renamed from `TFIDF` so it no longer shadows the module-level
    `TFIDF` name used elsewhere in this file.)
    """
    scores = _TFCalculteHebrewFreq(singleText, freqSingleText)
    # scale each word's TF by the IDF of its frequency-word form
    for word in scores:
        scores[word] *= _IDFCalculate(freqTextcollection, Parshiot.processWordByFrequency(word))
    return scores
def BazaakOutput(lang='heb', min_count=5, parshaResults=None, fileName=None):
    """Write Bazaak results to a CSV file: column 1 is the parsha name,
    column 2 the list of repeated words found in it.

    :param lang: language key, used in the default file name and in BazaakAll
    :param min_count: minimum repetition count forwarded to BazaakAll
    :param parshaResults: optional pre-computed results (helpful when
        already generated, e.g. by a main driver); computed via
        BazaakAll(lang, min_count) when omitted
    :param fileName: optional output path; defaults to
        '<subDir><lang>BazaakOutput.csv'
    """
    if not fileName:
        fileName = subDir + lang + 'Bazaak' + 'Output' + '.csv'
    parshaNames = Parshiot.parshaNames()
    if not parshaResults:
        parshaResults = BazaakAll(lang, min_count)
    # newline='' is required by the csv module; without it each row gains
    # an extra blank line on Windows (csv docs, "csv files" note)
    with open(fileName, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['parsha', 'repeated words']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for parsha in parshaNames:
            writer.writerow({
                'parsha': parsha,
                'repeated words': list(parshaResults[parsha].keys())
            })
def _TFCalculteHebrewFreq(full_text_array, freq_text_array):
    """Term frequency for Hebrew text via its frequency-converted form.

    Hebrew text arrives already tokenized from Parshiot.py. Relative
    frequencies are computed over the 2-letter frequency words, then each
    full word is assigned the TF of its frequency-word form.

    :return: Counter mapping each full word to that relative frequency
    """
    total = len(freq_text_array)
    # relative frequency of each 2-letter frequency word
    two_letter_tf = Counter(freq_text_array)
    for token in two_letter_tf:
        two_letter_tf[token] /= total
    # map every full word to the TF of its 2-letter form
    result = Counter(full_text_array)
    for word in result:
        result[word] = two_letter_tf[Parshiot.processWordByFrequency(word)]
    return result
def parshaFreqIDF(parshaName, parshiot, freqParshiot):
    """TF-IDF for a parsha's full words, scored via their frequency forms.

    Computes TF-IDF over the frequency-converted parshiot, then assigns
    each full word in the plain parsha the score of its frequency-word
    form.

    :return: Counter mapping each full word to its frequency-form TF-IDF
    """
    freqScores = TFIDF(freqParshiot[parshaName], freqParshiot, 'hebrew')
    result = Counter(parshiot[parshaName])
    # overwrite each word's count with the score of its frequency form
    for word in result:
        result[word] = freqScores[Parshiot.processWordByFrequency(word)]
    return result
import Parshiot, BazaakRead import csv import numpy as np import matplotlib.pyplot as plt hebResults = None engResults = None parshaNames = Parshiot.parshaNames() subDir = 'Results\\' # write bazaak results to a CSV file with parsha as column one and list of words as column 2 # option to pass in parshaResults (helpful if already generated, such as in the main here def BazaakOutput(lang='heb', min_count=5, parshaResults=None, fileName=None): if not fileName: fileName = subDir + lang + 'Bazaak' + 'Output' + '.csv' parshaNames = Parshiot.parshaNames() if not parshaResults: parshaResults = BazaakAll(lang, min_count) with open(fileName, mode='w', encoding='utf-8') as csv_file: fieldnames = ['parsha', 'repeated words'] writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() for parsha in parshaNames: writer.writerow({ 'parsha': parsha, 'repeated words': list(parshaResults[parsha].keys())