Пример #1
0
class Document:
    """A text file on disk together with the word counts read from it.

    Attributes:
        name: Label given to the document by the caller.
        path: Location of the file opened by ``readWords``.
        dictionary: ``Dictionary`` that receives one entry per counted word.
        readedWords: Vocabulary produced by the last vectorized read.
        totalWords: Number of words found by the last read (vocabulary size
            for the vectorized read, raw token count for the fast read).
    """

    def __init__(self, name, path):
        super().__init__()
        self.name = name
        self.path = path
        self.dictionary = Dictionary()
        self.readedWords = []
        self.totalWords = 0

    def readWords(self, stopWords=None, headers=None, fastReading=False):
        """Read the file at ``self.path`` and refill ``self.dictionary``.

        Args:
            stopWords: Words to leave out of the dictionary. In the vectorized
                read it is handed to ``CountVectorizer(stop_words=...)``; in
                the fast read any container supporting ``in`` works. ``None``
                disables stop-word filtering.
            headers: Prefixes; every line starting with one of them is
                dropped before counting. ``None`` disables the filter.
            fastReading: When exactly ``False`` the lines are counted with
                ``CountVectorizer``; any other value selects a plain
                whitespace split.
        """
        self.dictionary.clean()

        # The context manager closes the handle even if reading raises.
        with open(self.path, 'r', encoding="ISO-8859-1") as file:
            lines = file.readlines()

        if headers is not None:
            # Build a new list instead of removing while iterating: removing
            # in place makes the loop skip the line that follows each removed
            # header. str.startswith accepts a tuple of prefixes.
            prefixes = tuple(headers)
            lines = [line for line in lines if not line.startswith(prefixes)]

        if fastReading is False:
            vectorizer = CountVectorizer(stop_words=stopWords)
            counts = vectorizer.fit_transform(lines)
            # Newer scikit-learn releases only provide get_feature_names_out;
            # fall back to the older name when it is all that exists.
            if hasattr(vectorizer, "get_feature_names_out"):
                self.readedWords = list(vectorizer.get_feature_names_out())
            else:
                self.readedWords = vectorizer.get_feature_names()
            self.totalWords = len(self.readedWords)

            # One row per input line, one column per vocabulary entry.
            for row in counts.toarray():
                for index, count in enumerate(row):
                    if count != 0:
                        self.dictionary.searchAndAddWord(
                            CountedWord(self.readedWords[index], count))
        else:
            words = []
            for line in lines:
                words.extend(line.split())
            # The total deliberately includes stop words, as before.
            self.totalWords = len(words)

            excluded = () if stopWords is None else stopWords
            for word in words:
                # The stop-word check is done on the word as written; only
                # the stored form is lower-cased.
                if word not in excluded:
                    self.dictionary.searchAndAddWord(CountedWord(word.lower()))

    def clearReadedWords(self):
        """Drop the vocabulary kept from the last vectorized read."""
        self.readedWords = []
Пример #2
0
class Group:
    """A named set of documents sharing one aggregated dictionary."""

    def __init__(self, name, path, type):
        super().__init__()
        self.name, self.path, self.type = name, path, type

        self.documents = []
        self.dictionary = Dictionary()

        self.totalCountedWords = 0

    def readDocuments(self, stopWords=[], headers=[], fastReading=False):
        """Read every document and merge its words into the group dictionary.

        The three arguments are forwarded unchanged to each document's
        ``readWords``. Progress is reported through ``defaultProgress``.
        """
        self.dictionary.clean()

        print(f"Start reading group {self.name}, type: {self.type}")
        progress = defaultProgress(len(self.documents)).start()
        for done, doc in enumerate(self.documents, start=1):
            doc.readWords(stopWords, headers, fastReading)

            # Each document word enters the group dictionary with its count
            # and a trailing 1 as the last GroupedWord argument.
            for counted_word in doc.dictionary.words:
                grouped = GroupedWord(
                    counted_word.text, self, counted_word.counted, 1)
                self.dictionary.searchAndAddWord(grouped)

            doc.clearReadedWords()
            progress.update(done)
        self.setTotalCountedWords()
        progress.finish()
        print(f"Done reading group {self.name}")

    def setTotalCountedWords(self):
        """Recompute ``totalCountedWords`` from the group dictionary."""
        self.totalCountedWords = sum(
            entry.counted for entry in self.dictionary.words)

    def __str__(self):
        return "Group: {}".format(self.name)