Пример #1
0
def loadFiles(fileName):
    """Build an inverted index over the bundled corpus and save it.

    Walks ``<cwd>/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/``
    and visits every ``S*/data/set*/*.clean`` file.  Each file is read as
    UTF-8 text, passed through ``stemText`` and added to a new
    ``InvertedIndex`` under its full path.  Progress is printed along the way.

    The process working directory is changed while walking and is always
    restored afterwards, even when reading or indexing a file raises.

    Args:
        fileName: Value handed to ``InvertedIndex.save`` after all files have
            been indexed.
    """
    invertedIndex = InvertedIndex.InvertedIndex()

    # Remember the starting directory: the relative glob patterns below rely
    # on os.chdir, which is process-wide state that must be undone.
    startDir = os.getcwd()
    questionsDir = startDir + "/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/"

    try:
        os.chdir(questionsDir)
        for sDir in glob.glob("S*"):
            dataDir = questionsDir + sDir + "/data/"
            os.chdir(dataDir)
            print(sDir)
            for setDir in glob.glob("set*"):
                os.chdir(dataDir + setDir)
                print(setDir)
                for cleanFile in glob.glob("*.clean"):
                    fullFileName = dataDir + setDir + "/" + cleanFile
                    print(fullFileName)
                    # cleanFile is relative to the set directory we are in.
                    stemmedFile = stemText(path(cleanFile).text(encoding="utf8"))
                    print("File stemmed")
                    invertedIndex.indexDocument(stemmedFile, fullFileName)
                    print("File added to index")
    finally:
        # Restore the caller's working directory on success and on failure.
        os.chdir(startDir)

    # Saved only after a complete walk, relative to the restored directory.
    invertedIndex.save(fileName)
Пример #2
0
    def loadIncludedCorpusFiles(self, fileName):
        """Load the included corpus files into the current Inverted Index.

        Walks ``<cwd>/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/``
        and visits every ``S*/data/set*/*.clean`` file.  Each file is read as
        UTF-8 text, passed through ``stemText`` and added to this index via
        ``indexDocument`` under its full path.  Progress is printed along the
        way, and the index is saved once the walk completes.

        The process working directory is changed while walking and is always
        restored afterwards, even when reading or indexing a file raises.

        Args:
            fileName: Value handed to ``self.save`` after all files have been
                indexed.
        """
        # Remember the starting directory: the relative glob patterns below
        # rely on os.chdir, which is process-wide state that must be undone.
        startDir = os.getcwd()
        questionsDir = startDir + "/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/"

        try:
            os.chdir(questionsDir)
            for sDir in glob.glob("S*"):
                dataDir = questionsDir + sDir + "/data/"
                os.chdir(dataDir)
                print(sDir)
                for setDir in glob.glob("set*"):
                    os.chdir(dataDir + setDir)
                    print(setDir)
                    for cleanFile in glob.glob("*.clean"):
                        fullFileName = dataDir + setDir + "/" + cleanFile
                        print(fullFileName)
                        # cleanFile is relative to the set directory we are in.
                        stemmedFile = stemText(Path(cleanFile).text(encoding="utf8"))
                        print("File stemmed")
                        self.indexDocument(stemmedFile, fullFileName)
                        print("File added to index")
        finally:
            # Restore the caller's working directory on success and on failure.
            os.chdir(startDir)

        # Saved only after a complete walk, relative to the restored directory.
        self.save(fileName)
Пример #3
0
    def runQuery(self, query):
        """Run *query* through the proximity pipeline and show the results.

        The query is stemmed with ``stemText`` and given to
        ``mergeEachPostingPair``; that result is reduced by
        ``consolidateDocProximityList`` (both calls receive the value 5) and
        the reduced list is handed to ``showDocumentText`` with the value 10.
        """
        proximityHits = self.mergeEachPostingPair(stemText(query), 5)
        self.showDocumentText(
            self.consolidateDocProximityList(proximityHits, 5), 10)
Пример #4
0
    def showDocumentText(self, documentsByTermProximity, distanceFromTerm):
        """Print the passages most likely to answer the query.

        For each retrieved document the position list is consumed two entries
        at a time; every pair delimits a passage, which is widened by
        ``distanceFromTerm`` words on each side.  Each passage is stemmed and
        indexed as its own document in a temporary inverted index, and
        ``getKNearestDocs`` then selects 3 of them for ``self.query`` (by
        cosine similarity, per the original author's description).  The
        selected passages are printed.

        Args:
            documentsByTermProximity: Sized iterable of entries where
                ``entry[0]`` is a document id (1-based: it is looked up as
                ``self.invertedIndex.listOfFiles[id - 1]``) and ``entry[1]``
                is a flat list of word positions read as consecutive
                (first, second) pairs.  A trailing unpaired position is
                ignored.
            distanceFromTerm: Number of extra words of context to include
                before the first and after the second position of each pair.
        """
        totalSections = 0

        print("Documents Retrived: " + str(len(documentsByTermProximity)))

        blurbInvertedIndex = InvertedIndex.InvertedIndex()
        blurbList = []

        for doc in documentsByTermProximity:
            docId = doc[0]
            positions = doc[1]

            documentFile = self.invertedIndex.listOfFiles[docId - 1]
            rawDocumentText = Path(documentFile).text(encoding='utf8')
            # Lower-cased, space-separated words with embedded newlines removed;
            # whitespace-only tokens are dropped.
            documentText = [
                word.lower().replace('\n', '')
                for word in rawDocumentText.split(' ') if word.strip() != ''
            ]

            # Step through the positions pairwise: (0, 1), (2, 3), ...
            for i in range(0, len(positions) - 1, 2):
                # Clamp the window start so it never goes negative, which
                # would otherwise slice from the end of the word list.
                start = max(positions[i] - distanceFromTerm, 0)
                end = positions[i + 1] + distanceFromTerm

                termsInDoc = ' '.join(documentText[start:end])

                blurbList.append(termsInDoc)
                blurbInvertedIndex.indexDocument(stemText(termsInDoc), '')

            # Floor division keeps the count an integer on Python 3 and equal
            # to the number of pairs actually processed above.
            totalSections += len(positions) // 2

        print("Sections Retrieved: " + str(totalSections))

        blurbMatrix = blurbInvertedIndex.createTermDocMatrix()
        bestBlurbs = self.getKNearestDocs(self.query, blurbMatrix, 3)

        print("Showing Best 3 Results:")
        for blurbNum in bestBlurbs:
            print(blurbNum)
            # NOTE(review): assumes getKNearestDocs returns 1-based blurb
            # numbers matching insertion order into blurbList -- confirm.
            print(blurbList[blurbNum - 1])
Пример #5
0
from Parser import stemText
from Query import Query
import pandas as pd

import InvertedIndex
from cosineSim import cosineSim

# Scratch script: index each paragraph of one corpus article as its own
# document and build the term-document matrix.

# Raw string: in a normal Python 3 literal "\U" starts a \UXXXXXXXX escape, so
# 'C:\Users...' does not compile.  The path value itself is unchanged.
# The file is read as text (not bytes) so that split('\n') works on Python 3.
fulldoc = path(
    r'C:\Users/admin/Documents/575/parser/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/S08/data/set1/a1.txt.clean'
).text(encoding='utf8')

# One entry per non-blank line of the article.
docArray = [para for para in fulldoc.split('\n') if para.strip() != '']


print(len(docArray))

#print(docArray[4].split(' '))

print(stemText(docArray[4]))


tester = InvertedIndex.InvertedIndex()

for step in docArray:

    stemmed = stemText(step)

    tester.indexDocument(stemmed)

docTermMatrix = tester.createTermDocMatrix()

# Widen pandas' column display limit for inspecting the matrix.
pd.set_option('display.max_columns', 150)
#print(docTermMatrix.head())
Пример #6
0
from Parser import stemText
from Query import Query
import itertools

import InvertedIndex

# Scratch script: index each paragraph of one corpus article as its own
# document, then run a two-term proximity query against that index.

# Raw string: in a normal Python 3 literal "\U" starts a \UXXXXXXXX escape, so
# 'C:\Users...' does not compile.  The path value itself is unchanged.
# The file is read as text (not bytes) so that split('\n') works on Python 3.
fulldoc = path(
    r'C:\Users/admin/Documents/575/parser/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/S08/data/set1/a1.txt.clean'
).text(encoding='utf8')

# One entry per non-blank line of the article.
docArray = [para for para in fulldoc.split('\n') if para.strip() != '']

tester = InvertedIndex.InvertedIndex()

for step in docArray:

    stemmed = stemText(step)

    tester.indexDocument(stemmed)

print('running query')
queryObj = Query(tester)
queryText = "kangaroo marsupial"
stemmedQuery = stemText(queryText)

# The value 5 is passed to both proximity steps below.
docsWithTermProximity = queryObj.mergeEachPostingPair(stemmedQuery, 5)

print(docsWithTermProximity)

shortList = queryObj.consolidateDocProximityList(docsWithTermProximity, 5)
# NOTE(review): this passes three arguments (including docArray); confirm it
# matches the showDocumentText signature of the Query version in use.
queryObj.showDocumentText(shortList, docArray, 10)