def TFIDFFreq(singleText, freqTextcollection, freqSingleText):
    """Return TF-IDF scores for the words of ``singleText``.

    Both factors are taken from the "frequency" form of each word (per the
    original notes: the text converted to 2-letter words built from the
    rarest-frequency letters -- the conversion itself lives in
    ``Parshiot.processWordByFrequency``).

    Args:
        singleText: Tokens of the text, in full-word form.
        freqTextcollection: Collection handed to ``_IDFCalculate``;
            presumably the frequency-form corpus -- confirm with caller.
        freqSingleText: The same text as ``singleText`` in frequency form.

    Returns:
        The mapping produced by ``_TFCalculteHebrewFreq`` (keyed by full
        word), with every value multiplied in place by the IDF of that
        word's frequency form.
    """
    scores = _TFCalculteHebrewFreq(singleText, freqSingleText)

    # Only existing keys are reassigned, so iterating the live mapping is
    # safe; the TF value is read before the IDF is computed.
    for word, term_frequency in scores.items():
        freq_word = Parshiot.processWordByFrequency(word)
        scores[word] = term_frequency * _IDFCalculate(freqTextcollection, freq_word)

    return scores
def _TFCalculteHebrewFreq(full_text_array, freq_text_array):
    """Return the term frequency of each full word, measured on its frequency form.

    The Hebrew text is expected to arrive already tokenized (the original
    notes say this happens in Parshiot.py).

    Args:
        full_text_array: Tokens of the text in full-word form.
        freq_text_array: Tokens of the same text in frequency form
            (described in the original notes as 2-letter words).

    Returns:
        A ``Counter`` keyed by each distinct full word. The value is the
        relative frequency of that word's frequency form within
        ``freq_text_array``, or 0 when the form does not occur there.
    """
    # Relative frequency of every frequency-form token: count / total tokens.
    freq_tf = Counter(freq_text_array)
    token_total = len(freq_text_array)
    for freq_word, occurrences in freq_tf.items():
        freq_tf[freq_word] = occurrences / token_total

    # One entry per distinct full word, carrying the TF of its frequency
    # form. ``freq_tf`` is a Counter, so an unseen form yields 0.
    return Counter({
        word: freq_tf[Parshiot.processWordByFrequency(word)]
        for word in Counter(full_text_array)
    })
def parshaFreqIDF(parshaName, parshiot, freqParshiot):
    """Return a score for each distinct word of one parsha.

    The scores come from ``TFIDF`` run on the frequency-form text of the
    parsha; every full word then receives the score of its frequency form
    (as produced by ``Parshiot.processWordByFrequency``).

    Args:
        parshaName: Key selecting the parsha in both mappings.
        parshiot: Mapping from parsha name to its full-word tokens.
        freqParshiot: Mapping from parsha name to its frequency-form tokens;
            also passed to ``TFIDF`` as the whole collection.

    Returns:
        A ``Counter`` keyed by each distinct word of ``parshiot[parshaName]``
        whose value is the frequency-form score of that word.
    """
    freq_scores = TFIDF(freqParshiot[parshaName], freqParshiot, 'hebrew')
    distinct_words = Counter(parshiot[parshaName])

    return Counter({
        word: freq_scores[Parshiot.processWordByFrequency(word)]
        for word in distinct_words
    })