class PositionalIndex:
    """Positional index mapping each term to ``{docId: [positions]}``.

    Document ids are 1-based (index in ``self.collection`` plus one) and
    positions are 0-based offsets into that document's token list.
    """

    def __init__(self,
                 stopWordsPath="D:/Information Retrieval/IR/stop words.txt"):
        """Create an index over a small built-in sample corpus.

        Args:
            stopWordsPath: Path handed to ``StopWords`` (a helper defined
                outside this block). Defaults to the original hard-coded
                location.
        """
        # Sample corpus used until loadDocuments() replaces it.
        self.collection = [
            ['a', 'word', 'a', 'word', 'the'],
            ['the', 'a', 'brown', 'cat', 'the', 'a'],
            ['brown', 'cat', 'the', 'a', 'word'],
        ]
        self.dictionary = {}
        self.stopWords = StopWords(stopWordsPath)

    def loadDocuments(self,
                      directory="D:/Information Retrieval/IR/ShortStories/",
                      count=50):
        """Replace ``self.collection`` with tokens read from numbered files.

        Reads ``<directory>1.txt`` .. ``<directory><count>.txt``. Each
        non-blank line is stripped of every character that is not
        alphanumeric or whitespace, lower-cased, passed through
        ``self.stopWords.removeWords`` and split into tokens.

        Args:
            directory: Prefix prepended to ``"<n>.txt"``; it is concatenated
                as-is, so it must end with a path separator.
            count: Number of documents to read, numbered from 1.
        """
        # Raw string: the original non-raw literal relied on the invalid
        # escape sequence "\s", which newer Python versions warn about.
        pattern = r'[^a-zA-Z0-9\s]|[\n]'
        self.collection = []
        for number in range(1, count + 1):
            filename = directory + str(number) + ".txt"
            tokens = []
            with open(filename) as f_obj:
                for line in f_obj:
                    if line == '\n':
                        continue
                    cleaned = re.sub(pattern, '', line)
                    # Lower-case before stop-word removal, as
                    # Frequency.loadDocuments does; removeWords is defined
                    # elsewhere, so this matters only if it is
                    # case-sensitive.
                    cleaned = self.stopWords.removeWords(cleaned.lower())
                    # split() with no argument drops empty strings, so ""
                    # is never indexed as a term (split(" ") always
                    # produced at least a trailing one).
                    tokens.extend(cleaned.lower().split())
            self.collection.append(tokens)

    def buildDictionary(self):
        """Add every token position in ``self.collection`` to the index.

        Entries are appended to whatever ``self.dictionary`` already holds,
        so calling this twice on the same collection records each position
        twice.
        """
        for docId, tokens in enumerate(self.collection, start=1):
            for position, token in enumerate(tokens):
                postings = self.dictionary.setdefault(token, {})
                postings.setdefault(docId, []).append(position)

    def getPositionalIndex(self, key):
        """Return ``{docId: [positions]}`` for ``key``.

        Returns an empty list (not a dict) when the term is unknown; that
        return type is kept for backward compatibility.
        """
        return self.dictionary.get(key, [])
class InvertedIndex:
    """Inverted index mapping each term to a sorted list of document ids.

    Document ids are 1-based (index in ``self.collection`` plus one).
    """

    def __init__(self,
                 stopWordsPath="D:/Information Retrieval/IR/stop words.txt"):
        """Create an index over a small built-in sample corpus.

        Args:
            stopWordsPath: Path handed to ``StopWords`` (a helper defined
                outside this block). Defaults to the original hard-coded
                location.
        """
        # Sample corpus used until loadDocuments() replaces it.
        self.collection = [
            ['a', 'word', 'a', 'word', 'the'],
            ['the', 'a', 'brown', 'cat', 'the', 'a'],
            ['brown', 'cat', 'the', 'a', 'word'],
        ]
        self.dictionary = {}
        self.stopWords = StopWords(stopWordsPath)

    def loadDocuments(self,
                      directory="D:/Information Retrieval/IR/ShortStories/",
                      count=50):
        """Replace ``self.collection`` with tokens read from numbered files.

        Reads ``<directory>1.txt`` .. ``<directory><count>.txt``. Each
        non-blank line is stripped of every character that is not
        alphanumeric or whitespace, lower-cased, passed through
        ``self.stopWords.removeWords`` and split into tokens.

        Args:
            directory: Prefix prepended to ``"<n>.txt"``; it is concatenated
                as-is, so it must end with a path separator.
            count: Number of documents to read, numbered from 1.
        """
        # Raw string: the original non-raw literal relied on the invalid
        # escape sequence "\s", which newer Python versions warn about.
        pattern = r'[^a-zA-Z0-9\s]|[\n]'
        self.collection = []
        for number in range(1, count + 1):
            filename = directory + str(number) + ".txt"
            tokens = []
            with open(filename) as f_obj:
                for line in f_obj:
                    if line == '\n':
                        continue
                    cleaned = re.sub(pattern, '', line)
                    # Lower-case before stop-word removal, as
                    # Frequency.loadDocuments does; removeWords is defined
                    # elsewhere, so this matters only if it is
                    # case-sensitive.
                    cleaned = self.stopWords.removeWords(cleaned.lower())
                    # split() with no argument drops empty strings, so ""
                    # is never indexed as a term (split(" ") always
                    # produced at least a trailing one).
                    tokens.extend(cleaned.lower().split())
            self.collection.append(tokens)

    def buildDictionary(self):
        """Add the documents in ``self.collection`` to the index.

        Posting lists are merged with whatever ``self.dictionary`` already
        holds, then de-duplicated and sorted in ascending document-id order.
        """
        for docId, tokens in enumerate(self.collection, start=1):
            # dict.fromkeys de-duplicates within the document while keeping
            # first-occurrence order, so terms enter the dictionary in the
            # same order as before without one append per occurrence.
            for token in dict.fromkeys(tokens):
                self.dictionary.setdefault(token, []).append(docId)
        # Only values are reassigned here, never keys, so iterating items()
        # during the update is safe.
        for term, postings in self.dictionary.items():
            # sorted() makes the order deterministic; list(set(...)) gave
            # no ordering guarantee.
            self.dictionary[term] = sorted(set(postings))

    def getInvertedIndex(self, key):
        """Return the posting list for ``key``, or ``[]`` if it is unknown."""
        return self.dictionary.get(key, [])
class Frequency:
    """Term-frequency table mapping each term to ``{docId: count}``.

    Document ids are 1-based (index in ``self.collection`` plus one).
    """

    def __init__(
            self,
            stopWordsPath="D:/Information Retrieval/Assignment 2/stop words.txt"):
        """Create a table over a small built-in sample corpus.

        Args:
            stopWordsPath: Path handed to ``StopWords`` (a helper defined
                outside this block). Defaults to the original hard-coded
                location.
        """
        # Sample corpus used until loadDocuments() replaces it.
        self.collection = [
            ['w1', 'w2', 'w4', 'w6'],
            ['w1', 'w2', 'w7', 'w3'],
            ['w8', 'w5', 'w4', 'w5', 'w6'],
        ]
        self.dictionary = {}
        self.stopWords = StopWords(stopWordsPath)

    def loadDocuments(
            self,
            directory="D:/Information Retrieval/Assignment 2/ShortStories/",
            count=50):
        """Replace ``self.collection`` with tokens read from numbered files.

        Reads ``<directory>1.txt`` .. ``<directory><count>.txt``. Each
        non-blank line is stripped of every character that is not
        alphanumeric or whitespace, lower-cased, passed through
        ``self.stopWords.removeWords`` and split into tokens.

        Args:
            directory: Prefix prepended to ``"<n>.txt"``; it is concatenated
                as-is, so it must end with a path separator.
            count: Number of documents to read, numbered from 1.
        """
        # Raw string: the original non-raw literal relied on the invalid
        # escape sequence "\s", which newer Python versions warn about.
        pattern = r'[^a-zA-Z0-9\s]|[\n]'
        self.collection = []
        for number in range(1, count + 1):
            filename = directory + str(number) + ".txt"
            tokens = []
            with open(filename) as f_obj:
                for line in f_obj:
                    if line == '\n':
                        continue
                    cleaned = re.sub(pattern, '', line)
                    cleaned = self.stopWords.removeWords(cleaned.lower())
                    # split() with no argument drops empty strings, so ""
                    # is never counted as a term (split(" ") always
                    # produced at least a trailing one).
                    tokens.extend(cleaned.lower().split())
            self.collection.append(tokens)

    def buildDictionary(self):
        """Count every token occurrence in ``self.collection``.

        Counts are added to whatever ``self.dictionary`` already holds, so
        calling this twice on the same collection doubles every count.
        """
        for docId, tokens in enumerate(self.collection, start=1):
            for token in tokens:
                counts = self.dictionary.setdefault(token, {})
                counts[docId] = counts.get(docId, 0) + 1

    def getTermFrequency(self, key):
        """Return ``{docId: count}`` for ``key``.

        Returns an empty list (not a dict) when the term is unknown; that
        return type is kept for backward compatibility.
        """
        return self.dictionary.get(key, [])

    def getDocumentFrequency(self, key):
        """Return the list of document ids that contain ``key``.

        Despite the name, this returns the ids themselves; the document
        frequency is the length of the returned list. Unknown terms give
        ``[]``.
        """
        if key not in self.dictionary:
            return []
        return list(self.dictionary[key])

    def getWords(self):
        """Return every indexed term, in insertion order."""
        return list(self.dictionary)

    def getIdf(self, N):
        """Return ``log10(N / df)`` for every term, in ``getWords()`` order.

        Args:
            N: Value used as the numerator, i.e. the total number of
                documents in the standard idf formula.
        """
        return [
            math.log10(N / len(counts))
            for counts in self.dictionary.values()
        ]