from math import log10

# DocClass, DocFreqTable and WordTable are the project's Django models;
# getASCII, getTuple, Tuple, loadStopWords and get_my_string, together with
# the FREQ_THRESHOLD_PERCENT, STOP_WORD_LIST_FILENAME and
# MAX_NUM_OF_WORDS_READ constants, are defined elsewhere in the project.

def wKNN(filename, k):
    NUM_OF_DOCS = DocClass.objects.count()
    FREQ_THRESHOLD = NUM_OF_DOCS * FREQ_THRESHOLD_PERCENT
    # Build the vocabulary: keep only words whose document frequency is at
    # most the threshold (very frequent words carry little information).
    docInstances = DocFreqTable.objects.filter(docFreq__lte=FREQ_THRESHOLD)
    wordList = [getASCII(inst.word) for inst in docInstances]

    # Read the input file and record which of its words are keywords,
    # skipping stop words and capping the number of words read.
    stopwords = loadStopWords(STOP_WORD_LIST_FILENAME)
    with open(filename, encoding="latin-1") as filePointer:
        wordsInFile = filePointer.read().split()
    fileKeyWords = {}
    cnt = 0
    for word in wordsInFile:
        if getASCII(word) in stopwords:
            continue
        fileKeyWords[getASCII(word)] = True
        cnt += 1
        if cnt == MAX_NUM_OF_WORDS_READ:
            break

    # Presence-vector tuple for the input file.
    inputFileTuple = getTuple(wordList, fileKeyWords)

    # Map each training document to its class name (assuming the number of
    # documents can be stored in main memory).
    docList = []
    docToClassName = {}
    for inst in DocClass.objects.all():
        docList.append(inst.docName)
        docToClassName[inst.docName] = inst.className

    # One presence vector per training document.
    numRows = len(docList)
    numCols = len(wordList)
    knnMat = [[0] * numCols for _ in range(numRows)]
    for rowNumber, doc in enumerate(docList):
        docKeyWords = {}
        for wordInstance in WordTable.objects.filter(docName=doc):
            docKeyWords[wordInstance.word] = True
        knnMat[rowNumber] = getTuple(wordList, docKeyWords).list

    # Distance from the input file to every training document.
    distanceClassList = []
    for i in range(numRows):
        newTuple = Tuple()
        newTuple.list = knnMat[i]
        newTuple.count = numCols
        dis = inputFileTuple.distance(newTuple)
        distanceClassList.append((dis, docToClassName[docList[i]]))
    distanceClassList.sort()

    # Accumulate inverse-distance weights over the k nearest neighbours
    # (the 0.01 offset avoids division by zero).
    classCount = {}
    cnt = 0
    for dis, className in distanceClassList:
        wt = 1 / (dis + 0.01)
        classCount[className] = classCount.get(className, 0) + wt
        cnt += 1
        if cnt == k:
            break

    # Negate the weights so an ascending sort puts the heaviest class first.
    classCountList = sorted((-wt, className) for className, wt in classCount.items())
    # Both numerator and denominator are negated, so the percentages are positive.
    sum_rows = sum(negWt for negWt, _ in classCountList)
    answer_list = [(negWt / sum_rows * 100, className)
                   for negWt, className in classCountList]
    return answer_list
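# wKNN depends on a small presence-vector abstraction (Tuple, getTuple,
# getASCII) implemented elsewhere in the project. The sketch below is a
# hypothetical stand-in showing the assumed shape of those helpers; in
# particular, Hamming distance is an assumption here, not necessarily the
# project's actual metric.
class Tuple:
    def __init__(self):
        self.list = []   # binary presence vector over the global word list
        self.count = 0   # length of the vector

    def distance(self, other):
        # Hamming distance between two presence vectors (assumed metric).
        return sum(1 for a, b in zip(self.list, other.list) if a != b)

def getTuple(wordList, keyWords):
    # Mark, for every vocabulary word, whether it occurs in the document.
    t = Tuple()
    t.list = [1 if w in keyWords else 0 for w in wordList]
    t.count = len(wordList)
    return t

def getASCII(word):
    # Normalise a token to plain ASCII (assumed behaviour).
    return word.encode("ascii", "ignore").decode("ascii")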
def NaiveBayes(file):
    # Extract tokens from the document:
    #   1. take at most MAX_NUM_OF_WORDS_READ distinct tokens,
    #   2. keep only purely alphabetic tokens (filters meaningless strings),
    #   3. skip stop words.
    docCount = DocClass.objects.count()
    with open(file, encoding="latin-1") as fp:
        W = fp.read().split()
    cnt = 0
    processedW = {}
    stopwords = loadStopWords(STOP_WORD_LIST_FILENAME)
    for w_ in W:
        w = get_my_string(w_.lower())
        if not w.isalpha():
            continue
        if w in processedW:
            continue
        if w in stopwords:
            continue
        processedW[w] = 1
        cnt += 1
        if cnt == MAX_NUM_OF_WORDS_READ:
            break

    # All distinct class names, as a list of {'className': ...} dicts.
    C = DocClass.objects.all().values('className').distinct()
    # Number of terms in the vocabulary, used in the smoothing denominator.
    B = DocFreqTable.objects.all().count()
    # (negated log-probability, class) pairs.
    scores = []

    # Outer loop: for every class.
    for c in C:
        cName = c['className']
        # All documents belonging to the current class.
        docsC = DocClass.objects.filter(className=cName).values("docName")
        priorC = len(docsC) / docCount
        # prob accumulates the log-probability that the test document
        # belongs to this class, starting from the class prior.
        prob = log10(priorC)
        # Total number of (document, term) rows for this class: the
        # cumulative term count used as the smoothing denominator.
        denC = WordTable.objects.filter(docName__in=docsC).count()
        # Inner loop: for every term.
        for t in processedW:
            # Number of documents of the current class containing the term.
            numC = WordTable.objects.filter(docName__in=docsC, word=t).count()
            # How much evidence the term provides that this is the correct
            # class, with add-one (Laplace) smoothing.
            prob = prob + log10((1 + numC) / (denC + B))
        scores.append((-prob, cName))

    # Ascending sort on the negated scores puts the best class first;
    # negate again to restore the original log-probabilities.
    scores.sort()
    answer_list = [(-negProb, cName) for negProb, cName in scores]
    return answer_list
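# Example invocation, assuming a configured Django environment and a
# hypothetical test file. Both classifiers return (score, className) pairs
# with the best class first: wKNN scores are percentages of the total
# neighbour weight, NaiveBayes scores are log10 probabilities.
if __name__ == "__main__":
    knn_scores = wKNN("test_doc.txt", 5)
    nb_scores = NaiveBayes("test_doc.txt")
    print("wKNN top class:       %s (%.1f%%)" % (knn_scores[0][1], knn_scores[0][0]))
    print("NaiveBayes top class: %s (log-prob %.2f)" % (nb_scores[0][1], nb_scores[0][0]))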