Example #1
import os

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.text import hashing_trick


def load_log_files(dirs):
    """
    :param dirs: list of directories from which to load log files
    :return: log_files dataset and labels for the MLP, plus the list of all logs' names and paths
    """
    data_set = list()
    labels = list()
    names = list()
    paths = list()

    for iterator, directory in enumerate(dirs):
        files = os.listdir(directory)
        names.append(files)
        # Record the source directory once per file, so paths stays aligned with names
        paths.extend([directory] * len(files))

        for file in files:
            feature_vector = list()  # feature vector for an entire log file

            with open(os.path.join(directory, file), 'r') as log_file:
                for line in log_file:
                    # Hash every token of the line into one of 100000 integer buckets
                    line_vector = hashing_trick(text=line,
                                                n=100000,
                                                hash_function=None,
                                                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                                lower=True,
                                                split=' ')

                    for element in line_vector:
                        feature_vector.append(float(element))

            data_set.append(feature_vector)
            labels.append(iterator + 1)  # label = 1-based index of the source directory

    # Make shape of dataset uniform for the NN:
    # truncate every feature vector to the length of the shortest one
    min_len = min(len(array) for array in data_set)
    data_set = [array[:min_len] for array in data_set]

    # Use min-max scaling
    scaler = MinMaxScaler()
    data_set = scaler.fit_transform(np.array(data_set))

    names = merge_splits(names)  # project-specific helper that flattens the per-directory name lists

    return data_set, np.array(labels), names, paths
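A minimal calling sketch, assuming two hypothetical log directories and that the project's merge_splits helper is importable; the paths are illustrative only:

# Hypothetical directories; merge_splits must come from the same project
X, y, names, paths = load_log_files(['logs/normal', 'logs/anomalous'])
print(X.shape, y.shape)  # (num_log_files, min_len) and (num_log_files,)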
Example #2
from keras.preprocessing.text import text_to_word_sequence, one_hot, hashing_trick


def prep_1(text="The quick brown fox jumped over the lazy dog."):
    list_unique_words = list(set(text_to_word_sequence(text)))
    print(f"docs: {list_unique_words[:100]}")

    vocab_size = len(list_unique_words)
    print(f"vocab_size: {vocab_size}")

    # Oversize the hash space by ~30% to reduce collisions between words
    oh_encoding = one_hot(text, n=round(vocab_size * 1.3))
    print(f"oh_encoding: {oh_encoding}")

    # md5 is a stable hash, so these indices are reproducible across runs
    hashed_doc = hashing_trick(text,
                               n=round(vocab_size * 1.3),
                               hash_function='md5')
    print(f"hashed_doc: {hashed_doc}")

    return oh_encoding
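The ~1.3x oversizing of the hash space above exists because the hashing trick is lossy: distinct words can land on the same index. A small sketch that forces collisions on purpose (the sentence is illustrative):

from keras.preprocessing.text import hashing_trick

# With n=3 the indices are confined to [1, 2], so eight distinct
# words must collide onto just two buckets
print(hashing_trick('one two three four five six seven eight', n=3, hash_function='md5'))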
Example #3
import numpy as np
from keras.preprocessing import text


def test_hashing_trick_md5():
    sample_text = 'The cat sat on the mat.'
    encoded = text.hashing_trick(sample_text, 5, hash_function='md5')
    assert len(encoded) == 6     # one index per word
    assert np.max(encoded) <= 4  # indices lie in [1, n - 1]
    assert np.min(encoded) >= 1  # index 0 is reserved (e.g. for padding)
Example #4
import numpy as np
from keras.preprocessing import text


def test_hashing_trick_hash():
    sample_text = 'The cat sat on the mat.'
    encoded = text.hashing_trick(sample_text, 5)  # default: Python's built-in hash
    assert len(encoded) == 6     # one index per word
    assert np.max(encoded) <= 4  # indices lie in [1, n - 1]
    assert np.min(encoded) >= 1  # index 0 is reserved (e.g. for padding)
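The two tests differ only in the hash function, which matters for reproducibility: Keras documents that the default built-in hash is not stable across Python processes (its seed is randomized unless PYTHONHASHSEED is fixed), while 'md5' always yields the same indices. A quick sketch:

from keras.preprocessing.text import hashing_trick

# Identical on every run
print(hashing_trick('The cat sat on the mat.', 5, hash_function='md5'))
# May change between interpreter sessions unless PYTHONHASHSEED is set
print(hashing_trick('The cat sat on the mat.', 5))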
Example #5
File: keras.py Project: karobolas/repipe
 # Method excerpt from a transformer class; requires: from typing import List;
 # import pandas as pd; from keras.preprocessing.text import hashing_trick
 def _transform(self, X: pd.Series) -> List[List[int]]:
     # Hash every lower-cased document into self._hash_slots integer buckets
     return [
         hashing_trick(text, n=self._hash_slots)
         for text in X.str.lower()
     ]
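A self-contained sketch of how this method might sit inside a transformer class; the class name, constructor, and hash_slots default are assumptions for illustration, not taken from the repipe project:

from typing import List

import pandas as pd
from keras.preprocessing.text import hashing_trick


class HashingTextTransformer:  # hypothetical wrapper around the excerpt above
    def __init__(self, hash_slots: int = 1000):
        self._hash_slots = hash_slots  # size of the hash space

    def _transform(self, X: pd.Series) -> List[List[int]]:
        return [hashing_trick(text, n=self._hash_slots) for text in X.str.lower()]


docs = pd.Series(['The quick brown fox', 'jumped over the lazy dog'])
print(HashingTextTransformer(hash_slots=50)._transform(docs))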