Пример #1
0
        string = re.split(
            ' |-|\n|\u00e3|\u2019|\u201c|\u201d|\u2014|\u2018|\u00a9|\u00af|\u00aa|\u00b4|\u00a7|\u00a8',
            f.read())

        location = 0

        for word in string:

            word = word.lower()
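            # Strip leading/trailing punctuation from the word. str.strip() treats its
            # argument as a set of characters, so the '|' separators are stripped too:
            # , | ! . " ; : - _ # + @ ( ) / ? ~ ` [ ] { } = and \u00e3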
            word = word.strip(
                ',|!|.|"|;|:|-|_|#|+|@|)|(|/|?|~|`|[|]|{|}|=|\u00e3')

            if word not in stopWords:  # If word is not a stopword

                word = stemmer.stemWord(word)  # Stemming

                # ========= Building Inverted Index =========
                # If the word already exists in the II, Append the word's document number in the II.
                # If the word does not exist in the II, add the word as a key and also add the document number.
                if word in invertedIndex:
                    if file.split('.')[0] not in invertedIndex[word]:
                        invertedIndex[word].append(file.split('.')[0])
                else:
                    invertedIndex[word] = [file.split('.')[0]]

            # ========= Building Positional Index =========
            # If the word already exists in the PI, Append the word's document number and its position in the PI.
            # If the word does not exist in the PI, add the word as a key and also add the document number and the position of the word.
            if word in positionalIndex:
                if word not in stopWords: