Exemplo n.º 1
0
def get_vocabulary(sentences, ngrams, startIndex=1, init=None):
    """Build a Vocabulary from the n-grams of a list of sentences.

    Args:
        sentences: iterable of raw sentence strings.
        ngrams: iterable of n-gram sizes to extract (e.g. [1, 2]).
        startIndex: first index the Vocabulary assigns to a token.
        init: optional list of tokens pre-seeded into the vocabulary.

    Returns:
        A Vocabulary populated with every extracted n-gram.
    """
    # FIX: original used a mutable default argument (init=[]), which is
    # shared across all calls; use a None sentinel instead.
    vocabulary = Vocabulary(startIndex)
    vocabulary.add_list(init if init is not None else [])

    # Reuse one Sentence instance for every sentence to avoid re-allocation.
    sentence_ins = Sentence()
    for sentence in sentences:
        sentence_ins.set_sentence(sentence)
        for n in ngrams:
            vocabulary.add_list(sentence_ins.extract_n_gram(n))

    return vocabulary
Exemplo n.º 2
0
def get_bow_vectors(list_sentences, ngrams, vocab):
    """Encode each sentence as a binary bag-of-words vector over *vocab*.

    For every sentence the requested n-gram sizes are extracted and the
    matching vocabulary positions are set to 1; all other positions stay 0.
    Each vector has length vocab.length() + 1.
    """
    parser = Sentence()
    encoded = []
    for text in list_sentences:
        bow = [0] * (vocab.length() + 1)
        parser.set_sentence(text)
        for size in ngrams:
            features = parser.extract_n_gram(size)
            for position in vocab.get_vector_indices_by_list(features):
                bow[position] = 1
        encoded.append(bow)
    return encoded
Exemplo n.º 3
0
def get_window_item(sentence, window_size):
    """Slide a word window of *window_size* over *sentence*.

    Returns a list of tuples (window_text, accentless_center_word,
    original_center_word), one per word of the sentence. Both word lists
    are padded with '__begin__' / '__end__' sentinels so every word gets
    a full window; the window text is built from accent-stripped words.
    """
    half = window_size // 2
    plain_words = Sentence(sentence).remove_accents().split()
    original_words = sentence.split()

    left_pad = ['__begin__'] * half
    right_pad = ['__end__'] * half
    plain_words = left_pad + plain_words + right_pad
    original_words = left_pad + original_words + right_pad

    items = []
    for center in range(half, len(plain_words) - half):
        window_text = ' '.join(plain_words[center - half:center + half + 1])
        items.append((window_text, plain_words[center], original_words[center]))

    return items
Exemplo n.º 4
0
class Document:
    """Loads documents from a DB collection and cleans selected text fields."""

    def __init__(self, loader, colName, fields):
        # loader is expected to expose .db.find(collection, query)
        self.docs = []
        self.loader = loader
        self.colName = colName
        self.fields = fields
        self.sentence_instance = Sentence()

    def load_data(self, colName):
        """Replace self.docs with every document found in *colName*."""
        cursor = self.loader.db.find(colName, {})
        self.docs = []
        for record in cursor:
            self.docs.append(record)

    def preprocess(self):
        """Beautify, tokenize, and clean the configured fields of each doc."""
        for position, doc in enumerate(self.docs):
            for field in self.fields:
                self.sentence_instance.set_sentence(doc[field])
                self.sentence_instance.beautify()
                self.sentence_instance.tokenize(remove=False)
                doc[field] = self.sentence_instance.remove()
            self.docs[position] = doc

    def fit(self):
        """Load the collection, then preprocess it."""
        self.load_data(self.colName)
        self.preprocess()
Exemplo n.º 5
0
 def __init__(self, name, window_size, in_list):
     """Initialize with a window size and an optional membership filter."""
     super().__init__(name)
     self.window_size = window_size
     self.in_list = set(in_list)
     # Only consult the membership filter when it is non-empty.
     self.check_in_list = bool(self.in_list)
     self.sentences = []
     self.onelabel = {}
     self.result = {}
     self.sentence_instance = Sentence()
class Preprocessor(Model):
    """Beautifies raw sentences and collects the names extracted from them.

    input:  raw data (list of sentences)
    output: [list of names], [beautiful data]
    """

    def __init__(self, name="preprocessor"):
        super().__init__(name)
        self.raw = []
        self.sentence_instance = Sentence()

    def set_data(self, raw):
        """Store the raw sentences to be processed by run()."""
        self.raw = raw

    def run(self):
        """Process every raw sentence.

        Returns (cleaned_sentences, per-sentence name lists, set of all names).
        """
        sentences, list_names = [], []
        set_names = set()
        for raw_sentence in self.raw:
            cleaned, names = self.working_on(raw_sentence)
            sentences.append(cleaned)
            list_names.append(list(names))
            set_names.update(names)
        return sentences, list_names, set_names

    def run_for_test(self, sentence):
        """Process a single sentence (test helper)."""
        self.test = dict()
        cleaned, names = self.working_on(sentence)
        return cleaned, names

    def restore(self, sentence, names):
        """Re-insert previously extracted names back into *sentence*."""
        return Sentence().restore_sentence(sentence, names)

    def working_on(self, sentence):
        """Beautify one sentence; return (beautified, extracted_names)."""
        self.sentence_instance.set_sentence(sentence)
        beautified = self.sentence_instance.beautify()
        return beautified, self.sentence_instance.get_extracted_names()
Exemplo n.º 7
0
class InvertedIndex:
    """Maps each term to the set of indices of documents containing it."""

    def __init__(self):
        self.terms = dict()  # term -> set of document indices
        self.docs = []
        self.sentence_instance = Sentence()

    def add_docs(self, docs):
        """Append *docs* to the corpus."""
        self.docs.extend(docs)

    def preprocess(self):
        """Normalize every stored document in place."""
        for position, raw in enumerate(self.docs):
            self.sentence_instance.set_sentence(raw)
            self.sentence_instance.beautify()
            self.sentence_instance.tokenize(remove=False)
            self.docs[position] = self.sentence_instance.remove()

    def calculate(self):
        """Index the whitespace-separated words of every document."""
        for position, doc in enumerate(self.docs):
            self._index(doc.split(), position)

    def _index(self, words, index):
        """Record that each word in *words* occurs in document *index*."""
        for word in words:
            self.terms.setdefault(word, set()).add(index)

    def fit(self):
        """Preprocess the corpus, then build the index."""
        self.preprocess()
        self.calculate()

    def test(self):
        """Debug dump of the corpus and the index."""
        print(self.docs)
        print(self.terms)
Exemplo n.º 8
0
 def __init__(self, loader, colName, fields):
     """Wire up the DB loader, target collection, and fields to clean."""
     self.loader = loader
     self.colName = colName
     self.fields = fields
     self.docs = []
     self.sentence_instance = Sentence()
Exemplo n.º 9
0
 def __init__(self):
     """Start with an empty corpus and an empty term index."""
     self.docs = []
     self.terms = {}
     self.sentence_instance = Sentence()
 def restore(self, sentence, names):
     """Re-insert extracted names into *sentence* using a fresh Sentence."""
     restorer = Sentence()
     return restorer.restore_sentence(sentence, names)
 def __init__(self, name="preprocessor"):
     """Initialize the preprocessor with an empty raw-sentence list."""
     super().__init__(name)
     self.sentence_instance = Sentence()
     self.raw = []