# NOTE(review): this span is the tail of a function whose `def` lies outside
# this chunk; line 1 appears to have lost its leading indentation during
# extraction. Code left byte-identical, comments only.
dlls_vectorizer.fit(dlls_train)
    dlls_train = dlls_vectorizer.transform(dlls_train)
    dlls_test = dlls_vectorizer.transform(dlls_test)

    # Load the pre-trained symbols word2vec model and build a
    # word -> embedding-vector lookup for the tf-idf-weighted vectorizer.
    # NOTE(review): `index2word` / `syn0` are pre-1.0 gensim attribute names;
    # gensim >= 4 renamed them to `wv.index_to_key` / `wv.vectors` — confirm
    # the pinned gensim version supports these.
    symbols_model = gensim.models.Word2Vec.load('w2v/symbols.100.5.model')
    symbols_w2v = dict(zip(symbols_model.index2word, symbols_model.syn0))
    #symbols_vectorizer = MeanEmbeddingVectorizer(symbols_w2v)
    symbols_vectorizer = TfidfEmbeddingVectorizer(symbols_w2v)

    symbols_vectorizer.fit(symbols_train)
    symbols_train = symbols_vectorizer.transform(symbols_train)
    symbols_test = symbols_vectorizer.transform(symbols_test)

    # concatenate numerical and textual features
    X_train = concatenate_features(
        features_train, [identifiers_train, dlls_train, symbols_train])
    X_test = concatenate_features(features_test,
                                  [identifiers_test, dlls_test, symbols_test])

    # normalize features
    # RBF-kernel SVMs are commonly fed features centered around zero, hence
    # the (-1, 1) range; every other classifier gets the default (0, 1).
    # The scaler is fit on the training split only, then applied to both
    # splits, avoiding test-set leakage.
    if classifier == "svcrbf":
        normalization = MinMaxScaler(feature_range=(-1, 1))
    else:
        normalization = MinMaxScaler()
    normalization.fit(X_train)
    X_train = normalization.transform(X_train)
    X_test = normalization.transform(X_test)

    # use in a classifier
    # NOTE(review): SVC's `probability` parameter is a bool that enables
    # Platt scaling; passing THRESHOLD here looks suspicious unless THRESHOLD
    # is itself a boolean flag — confirm intent.
    if classifier == "svc":
        clf = SVC(kernel="linear", probability=THRESHOLD)

# csv header: the first i_dlls-1 numeric feature names, an elapsed-time
# column, one column per tf-idf dimension of the dlls/symbols vocabularies,
# and finally the ground-truth label and the sample's md5 hash.
h_tf = [USED_FEATURES[k] for k in range(i_dlls - 1)]
h_tf.append("Time")
h_tf.extend("dlls" + str(k) for k in range(1, dlls_tf.shape[1] + 1))
h_tf.extend("symbols" + str(k) for k in range(1, symbols_tf.shape[1] + 1))
h_tf.extend(["label", "md5"])

# Ensure the numeric features are a numpy array before stacking them with
# the sparse tf-idf matrices.
num_features = np.array(num_features)

# concatenate numerical and textual features into one design matrix
X_tf = concatenate_features(num_features, [dlls_tf, symbols_tf])

# normalization: rescale every feature column into the default [0, 1] range
# (fit_transform is equivalent to fit followed by transform on the same data)
normalization_tfidf = MinMaxScaler()
X_tf = normalization_tfidf.fit_transform(X_tf)

# write features csv: append the header, then one row per sample
# (scaled feature vector, label, md5).
# Fix: the original opened the file and never closed it, leaking the handle
# and risking unflushed/truncated output; a context manager guarantees the
# file is closed even on error. `newline=''` is the csv-module-documented
# open mode, preventing spurious blank lines on Windows.
with open('mw_features_full_sorted_2.csv', "a", newline='') as csv_ordered:
    c_ordered = csv.writer(csv_ordered)
    c_ordered.writerow(h_tf)
    for i in range(X_tf.shape[0]):
        c_ordered.writerow(
            np.concatenate((X_tf[i], [labels[i]], [md5s[i]]), axis=0))
# train tfidf for each textual feature
# One bag-of-words tf-idf model per textual channel (identifiers, dlls,
# symbols), each capped at N_FEATURES terms; fit_transform is equivalent
# to fit followed by transform on the same corpus.
identifiers_tfidf = TfidfVectorizer(max_features=N_FEATURES)
identifiers_tf = identifiers_tfidf.fit_transform(identifiers)

dlls_tfidf = TfidfVectorizer(max_features=N_FEATURES)
dlls_tf = dlls_tfidf.fit_transform(dlls)

symbols_tfidf = TfidfVectorizer(max_features=N_FEATURES)
symbols_tf = symbols_tfidf.fit_transform(symbols)

# concatenate numerical and textual features into one design matrix
X_tf = concatenate_features(features, [identifiers_tf, dlls_tf, symbols_tf])

# W2V

# Load pre-trained word2vec models (presumably 100-dim, window 5, judging by
# the file names — TODO confirm) and turn each corpus into dense per-document
# vectors by averaging word embeddings.
# NOTE(review): `index2word` / `syn0` are pre-1.0 gensim attribute names;
# gensim >= 4 renamed them to `wv.index_to_key` / `wv.vectors` — confirm the
# pinned gensim version.
identifiers_model = gensim.models.Word2Vec.load('w2v/identifiers.100.5.model')
# word -> embedding-vector lookup for the vectorizer
identifiers_w2v = dict(zip(identifiers_model.index2word, identifiers_model.syn0))
identifiers_vectorizer = MeanEmbeddingVectorizer(identifiers_w2v)
identifiers_vectorizer.fit(identifiers)
# NOTE: identifiers_w2v is rebound here — embedding dict above, transformed
# document matrix below.
identifiers_w2v = identifiers_vectorizer.transform(identifiers)

dlls_model = gensim.models.Word2Vec.load('w2v/dlls.100.5.model')
# word -> embedding-vector lookup for the vectorizer
dlls_w2v = dict(zip(dlls_model.index2word, dlls_model.syn0))
dlls_vectorizer = MeanEmbeddingVectorizer(dlls_w2v)
dlls_vectorizer.fit(dlls)
# same rebinding pattern as identifiers_w2v above
dlls_w2v = dlls_vectorizer.transform(dlls)