class Document:
    """A text file on disk together with the word counts extracted from it.

    Attributes set by ``__init__``:
        name: label given to the document.
        path: path of the file that ``readWords`` opens.
        dictionary: ``Dictionary`` that receives one ``CountedWord`` per
            word found by ``readWords``.
        readedWords: vocabulary produced by the vectorizer mode of
            ``readWords`` (left untouched by the split mode).
        totalWords: total set by ``readWords``; its meaning depends on the
            reading mode (see ``readWords``).
    """

    def __init__(self, name, path):
        super().__init__()
        self.name = name
        self.path = path
        self.dictionary = Dictionary()
        self.readedWords = []
        self.totalWords = 0

    def readWords(self, stopWords=None, headers=None, fastReading=False):
        """Read the file at ``self.path`` and refill ``self.dictionary``.

        The dictionary is cleaned first, the file is read as ISO-8859-1 and
        every line that starts with one of ``headers`` is discarded.

        Two reading modes exist:

        * ``fastReading is False`` (default): the remaining lines are fed to
          ``CountVectorizer``; each non-zero per-line count is added to the
          dictionary as ``CountedWord(term, count)``. ``readedWords`` holds
          the vocabulary and ``totalWords`` is the vocabulary size.
        * otherwise: lines are split on whitespace; every token that is not
          a stop word is added as ``CountedWord(token.lower())``.
          ``totalWords`` is the number of tokens, counted before stop-word
          filtering.

        Args:
            stopWords: stop words to ignore. ``None`` means no stop words.
                It is passed as-is to ``CountVectorizer(stop_words=...)``
                in vectorizer mode.
            headers: line prefixes; lines starting with any of them are
                skipped. ``None`` disables the filtering.
            fastReading: selects the reading mode as described above.
        """
        if stopWords is None:
            stopWords = []

        self.dictionary.clean()

        # The context manager closes the handle even if reading fails.
        with open(self.path, 'r', encoding="ISO-8859-1") as file:
            lines = file.readlines()

        if headers is not None:
            # Build a new list instead of removing while iterating: removal
            # during iteration skips the line following each removed one.
            prefixes = tuple(headers)
            lines = [line for line in lines if not line.startswith(prefixes)]

        if fastReading is False:
            vectorizer = CountVectorizer(stop_words=stopWords)
            counts = vectorizer.fit_transform(lines)

            # get_feature_names() was removed in scikit-learn 1.2; prefer its
            # replacement and fall back for older releases.
            featureNames = getattr(vectorizer, "get_feature_names_out", None)
            if featureNames is None:
                featureNames = vectorizer.get_feature_names
            self.readedWords = list(featureNames())
            self.totalWords = len(self.readedWords)

            # One row per input line, one column per vocabulary term.
            for row in counts.toarray():
                for index, count in enumerate(row):
                    if count != 0:
                        self.dictionary.searchAndAddWord(
                            CountedWord(self.readedWords[index], count))
        else:
            words = []
            for line in lines:
                words.extend(line.split())
            self.totalWords = len(words)

            for word in words:
                lowered = word.lower()
                try:
                    # Entries are stored lower-cased, so the lower-cased form
                    # is checked as well; otherwise "The" would slip past a
                    # stop list containing "the".
                    isStopWord = word in stopWords or lowered in stopWords
                except TypeError:
                    # stopWords does not support membership tests: treat it
                    # as an empty stop list.
                    isStopWord = False
                if not isStopWord:
                    self.dictionary.searchAndAddWord(CountedWord(lowered))

    def clearReadedWords(self):
        """Drop the vocabulary kept from the last vectorizer-mode read."""
        self.readedWords = []
class Group:
    """A named set of documents whose word counts are merged into one dictionary.

    Attributes set by ``__init__``:
        name: label of the group, used in progress messages and ``__str__``.
        path: path associated with the group (not read by this class).
        type: group type, only echoed in the start message.
        dictionary: ``Dictionary`` receiving one ``GroupedWord`` per word of
            every document read.
        documents: documents processed by ``readDocuments``.
        totalCountedWords: sum of ``counted`` over the group dictionary.
    """

    def __init__(self, name, path, type):
        super().__init__()
        self.name = name
        self.path = path
        self.type = type
        self.dictionary = Dictionary()
        self.documents = []
        self.totalCountedWords = 0

    def readDocuments(self, stopWords=None, headers=None, fastReading=False):
        """Read every document of the group and rebuild the group dictionary.

        The group dictionary is cleaned, then each document is read with
        ``document.readWords(stopWords, headers, fastReading)`` and each of
        its dictionary words is added to the group dictionary as
        ``GroupedWord(word.text, self, word.counted, 1)``. Progress is
        reported through the bar returned by ``defaultProgress``, and
        ``totalCountedWords`` is refreshed once all documents are read.

        Args:
            stopWords: forwarded to ``readWords``; ``None`` means an empty list.
            headers: forwarded to ``readWords``; ``None`` means an empty list.
            fastReading: forwarded to ``readWords`` unchanged.
        """
        # Fresh lists per call instead of shared mutable default arguments.
        stopWords = [] if stopWords is None else stopWords
        headers = [] if headers is None else headers

        self.dictionary.clean()
        print(f"Start reading group {self.name}, type: {self.type}")
        bar = defaultProgress(len(self.documents)).start()

        for done, document in enumerate(self.documents, start=1):
            document.readWords(stopWords, headers, fastReading)
            for word in document.dictionary.words:
                self.dictionary.searchAndAddWord(
                    GroupedWord(word.text, self, word.counted, 1))
            # The per-document vocabulary is no longer needed once merged.
            document.clearReadedWords()
            bar.update(done)

        self.setTotalCountedWords()
        bar.finish()
        print(f"Done reading group {self.name}")

    def setTotalCountedWords(self):
        """Recompute ``totalCountedWords`` from the group dictionary."""
        self.totalCountedWords = sum(
            word.counted for word in self.dictionary.words)

    def __str__(self):
        return f"Group: {self.name}"