Пример #1
0
def filterBazaakParshaReadTFIDF(parshaName,
                                lang='heb',
                                min_count=MIN_WORD_COUNT,
                                splitParshiot=None,
                                min_distance=MIN_DISTANCE):
    """Run BazaakRead on one parsha, keeping only words whose TF-IDF score
    falls in the top PERCENT% for that parsha.

    Args:
        parshaName: key into splitParshiot identifying the parsha.
        lang: language key used when building the corpus ('heb' by default).
        min_count: minimum word count, forwarded to BazaakRead.
        splitParshiot: optional pre-built parsha-name -> tokenized-text mapping;
            built via Parshiot.createSplitParshiot(lang) when not supplied.
        min_distance: distance threshold, forwarded to BazaakRead.

    Returns:
        dict mapping each surviving word to its BazaakRead result.
    """
    if not splitParshiot:
        splitParshiot = Parshiot.createSplitParshiot(lang)

    topTFIDF = TFIDF.parshaIDF(parshaName, splitParshiot)
    totalWords = len(topTFIDF)

    # find the fraction of words to keep
    percent = PERCENT / 100

    # keep the highest-scoring PERCENT% of words; use a set so the membership
    # test in the dict comprehension below is O(1) instead of an O(n) list scan
    topWords = {word for word, _ in
                topTFIDF.most_common(int(totalWords * percent))}

    parsha = splitParshiot[parshaName]
    read = BazaakRead(parsha, min_count, min_distance)

    # only keep results whose key made the TF-IDF cut
    return {k: v for k, v in read.items() if k in topWords}
Пример #2
0
def freqBazaakParshaRead(parshaName,
                         freqParshiot=None,
                         min_count=MIN_WORD_COUNT,
                         min_distance=MIN_DISTANCE):
    """Run BazaakRead on a single parsha taken from the frequency-processed
    corpus (built on demand when freqParshiot is not supplied)."""
    corpus = freqParshiot if freqParshiot else Parshiot.processParshiotByFrequency()
    return BazaakRead(corpus[parshaName], min_count, min_distance)
Пример #3
0
def BazaakParshaRead(parshaName,
                     lang='heb',
                     min_count=MIN_WORD_COUNT,
                     splitParshiot=None,
                     min_distance=MIN_DISTANCE):
    """Run BazaakRead on a single parsha from the split-parshiot corpus
    (built for `lang` on demand when splitParshiot is not supplied)."""
    corpus = splitParshiot if splitParshiot else Parshiot.createSplitParshiot(lang)
    return BazaakRead(corpus[parshaName], min_count, min_distance)
Пример #4
0
def BazaakAll(lang='heb',
              min_count=5,
              min_distance=80,
              filtered=False,
              strippedDown=True):
    """Run a Bazaak read over every parsha in `parshaNames`.

    Returns a dict mapping each parsha name to its read result. With
    filtered=True the TF-IDF-filtered reader is used; strippedDown selects
    the frequency-processed corpus over the plain language split.
    """
    # choose the corpus once, up front
    if strippedDown:
        parshiot = Parshiot.processParshiotByFrequency()
    else:
        parshiot = Parshiot.createSplitParshiot(lang)

    # dispatch to the filtered or plain reader; both share the same signature
    reader = (BazaakRead.filterBazaakParshaReadTFIDF if filtered
              else BazaakRead.BazaakParshaRead)
    return {parsha: reader(parsha, lang, min_count, parshiot, min_distance)
            for parsha in parshaNames}
Пример #5
0
def TFIDFFreq(singleText, freqTextcollection, freqSingleText):
    """TF-IDF computed through the frequency-reduced representation.

    TF values come from _TFCalculteHebrewFreq over the 2-letter frequency
    form of the text; each word's score is then multiplied by the IDF of its
    frequency-form word across the frequency collection.
    """
    scores = _TFCalculteHebrewFreq(singleText, freqSingleText)
    for word in scores:
        freqWord = Parshiot.processWordByFrequency(word)
        scores[word] *= _IDFCalculate(freqTextcollection, freqWord)
    return scores
Пример #6
0
def BazaakOutput(lang='heb', min_count=5, parshaResults=None, fileName=None):
    """Write Bazaak results to a CSV: column 1 = parsha, column 2 = word list.

    Args:
        lang: language key; used in the default file name and passed to BazaakAll.
        min_count: minimum word count, forwarded to BazaakAll.
        parshaResults: optional precomputed {parsha: result-dict}; computed
            via BazaakAll(lang, min_count) when not supplied.
        fileName: optional output path; defaults to subDir + lang + 'BazaakOutput.csv'.
    """
    if not fileName:
        fileName = subDir + lang + 'Bazaak' + 'Output' + '.csv'
    parshaNames = Parshiot.parshaNames()
    if not parshaResults:
        parshaResults = BazaakAll(lang, min_count)

    # newline='' is required when handing a file to the csv module; without it
    # the writer emits an extra blank row between records on Windows
    with open(fileName, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['parsha', 'repeated words']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for parsha in parshaNames:
            writer.writerow({
                'parsha': parsha,
                'repeated words': list(parshaResults[parsha].keys())
            })
Пример #7
0
def _TFCalculteHebrewFreq(full_text_array, freq_text_array):
    """Term frequencies for full words, taken from their 2-letter
    frequency-form counterparts.

    Hebrew text arrives already tokenized from Parshiot.py. Each full word's
    TF is the relative frequency of its 2-letter frequency-form word in
    freq_text_array.
    """
    # relative frequency of every 2-letter frequency-form word
    freqTF = Counter(freq_text_array)
    total = len(freq_text_array)
    for token in freqTF:
        freqTF[token] /= total

    # score each full word by the TF of its frequency-form equivalent
    result = Counter(full_text_array)
    for word in result:
        result[word] = freqTF[Parshiot.processWordByFrequency(word)]
    return result
Пример #8
0
def parshaFreqIDF(parshaName, parshiot, freqParshiot):
    """Score each full word of a parsha with the TF-IDF of its
    frequency-form (2-letter) counterpart.

    Args:
        parshaName: key identifying the parsha in both corpora.
        parshiot: parsha-name -> tokenized full text.
        freqParshiot: parsha-name -> frequency-form text collection.

    Returns:
        Counter mapping each full word to its frequency-form TF-IDF score.
    """
    freqTFIDF = TFIDF(freqParshiot[parshaName], freqParshiot, 'hebrew')
    regTFIDF = Counter(parshiot[parshaName])
    # iterate keys only — the original counts are overwritten, and the old
    # loop bound an unused `value` from .items()
    for word in regTFIDF:
        regTFIDF[word] = freqTFIDF[Parshiot.processWordByFrequency(word)]
    return regTFIDF
Пример #9
0
import Parshiot, BazaakRead
import csv
import numpy as np
import matplotlib.pyplot as plt

# cached per-language Bazaak results — presumably filled in by a driver
# elsewhere in the file; TODO confirm against the main section
hebResults = None
engResults = None
# list of all parsha names, loaded once at import time
parshaNames = Parshiot.parshaNames()
# output directory for generated CSV files (Windows-style separator)
subDir = 'Results\\'


# write bazaak results to a CSV file with parsha as column one and list of words as column 2
# option to pass in parshaResults (helpful if already generated, such as in the main here
def BazaakOutput(lang='heb', min_count=5, parshaResults=None, fileName=None):
    if not fileName:
        fileName = subDir + lang + 'Bazaak' + 'Output' + '.csv'
    parshaNames = Parshiot.parshaNames()
    if not parshaResults:
        parshaResults = BazaakAll(lang, min_count)

    with open(fileName, mode='w', encoding='utf-8') as csv_file:
        fieldnames = ['parsha', 'repeated words']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for parsha in parshaNames:
            writer.writerow({
                'parsha':
                parsha,
                'repeated words':
                list(parshaResults[parsha].keys())