Example #1
def extract_features(segment):
    data = np.asarray(extract(np.asarray(segment)))
    data = np.array([data])
    data_chu = scaler_chu.transform(data)
    data_df = scaler_df.transform(data)

    return data_chu, data_df
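scaler_chu and scaler_df are module-level scalers that must be fitted before this function is used. A minimal setup sketch, assuming scikit-learn StandardScaler objects and hypothetical training feature matrices:
# Sketch only: the matrices below are hypothetical stand-ins for the feature
# vectors the project computes from its own recordings.
import numpy as np
from sklearn.preprocessing import StandardScaler

train_feats_chu = np.random.rand(100, 13)   # hypothetical (n_samples, n_features)
train_feats_df = np.random.rand(100, 13)

scaler_chu = StandardScaler().fit(train_feats_chu)
scaler_df = StandardScaler().fit(train_feats_df)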
Example #2
def prepareData(ratings, reviews):
    # We call the extract method to retrieve the most relevant words (features)
    features = extract(reviews)

    # For each review, we only keep the words that are features
    filtered_tokens = [filter(review, features) for review in reviews]
    filtered_reviews = [" ".join(tokens) for tokens in filtered_tokens]

    # We create a column for each feature
    # If that feature is mentioned in the review, the value is 1, else 0
    cv = CountVectorizer(binary=True)
    x = cv.fit_transform(filtered_reviews)

    x_df = pd.DataFrame(x.toarray(), columns=cv.get_feature_names_out())

    # To simplify our classification, we make sure that high ratings are considered positive (1),
    # while low ones are considered negative (0)
    ratings = transform_rating(ratings)

    return ratings, x_df
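A hedged usage sketch for prepareData: fit a simple scikit-learn classifier on the binary feature matrix it returns (the logistic regression choice and the ratings/reviews placeholders are assumptions, not part of the original project):
# Sketch only: `ratings` and `reviews` stand in for the project's real dataset.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y, x_df = prepareData(ratings, reviews)
x_train, x_test, y_train, y_test = train_test_split(x_df, y, test_size=0.2)
clf = LogisticRegression(max_iter=1000).fit(x_train, y_train)
print(clf.score(x_test, y_test))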
Example #3
def ext_feat():
    # Extract features from every wav file and record its word and gender
    # labels (looked up from the first two characters of the file name).
    for i, file in enumerate(files):
        _, sig = wav.read(os.path.join(DATA_SET_PATH, file))
        feats = extract(sig)
        features.append(feats)
        words_labels.append(Label_Map[file[0]])
        gender_lables.append(Label_Map[file[1]])
        bar.update(i + 1)
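This function relies on module-level state (files, features, words_labels, gender_lables, Label_Map, bar, DATA_SET_PATH). A rough sketch of how that state might be initialized, with hypothetical values and assuming the progressbar2 package:
# Sketch only: the path, the label map, and the progress bar are hypothetical.
import os
import progressbar
import scipy.io.wavfile as wav

DATA_SET_PATH = "data_set"                        # hypothetical directory of wav files
files = os.listdir(DATA_SET_PATH)
Label_Map = {"0": "zero", "1": "one", "m": "male", "f": "female"}   # hypothetical mapping
features, words_labels, gender_lables = [], [], []
bar = progressbar.ProgressBar(max_value=len(files))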
Example #4
def read(path):
    # Parse a tagged corpus file: build (token, label) pairs per sentence, cut
    # them into fixed-length windows, embed each window (PhoBERT + manual +
    # character features), and return padded X and y arrays.
    global count, thresh, split_count
    X = []
    y = []
    sents = []
    with open(path, "r") as file:
        data = file.read().split('.')
    for i in tqdm(range(len(data))):
        d = data[i]
        thresh -= 1
        if thresh > 0 or len(d) == 0:
            continue
        d = re.sub(".\s\)", "", d).replace('<', ' <').replace('>', '> ')
        d = re.sub(">", "> ", d)
        d = re.sub(r"(<[^><\s]+)", r"\1>", d)
        d = re.sub(">>", ">", d)
        d = re.sub('<<', '<', d)
        d = re.sub("</", " </", d).split()
        #         d = d.split()
        sentence, label = gen_label(d)
        sentence = remove_tags(sentence)
        sentence = re.sub(r'<.*?>', '', ' '.join(sentence)).split()
        sentence = np.array(sentence)
        label = np.array(label)
        #         print(np.unique(label))
        if label.shape != sentence.shape:
            print(len(label), len(sentence))
            print(label)
            print(sentence)
            print(data[i])
        assert label.shape == sentence.shape
        list_sent = windowing(sentence)
        list_label = windowing(label)
        assert len(list_label) == len(list_sent)
        assert len(list_sent[0]) == len(list_label[0])

        for j in range(len(list_sent)):
            sentence = ' '.join(list_sent[j])
            l = list_label[j]
            # manual + fasttext
            manual_feat = extract(sentence.split())

            # character encoding
            char_list = sent_to_char(sentence.split())

            # phobert embedding
            sentence = extract_bert(sentence)
            sentence = np.hstack((sentence, manual_feat, char_list))
            pad_len = SENT_LENGTH - len(l)
            l += ['pad'] * (pad_len)
            l = np.array(l, dtype='<U12')
            sentence = np.append(sentence,
                                 np.zeros((pad_len, sentence.shape[1])),
                                 axis=0)
            X.append(sentence)
            y.append(l)
        #print(sentence.shape, label.shape, '\n')
    return np.array(X), np.array(y, dtype='<U12')
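windowing and SENT_LENGTH come from elsewhere in the project; a plausible minimal sketch, assuming the helper simply splits a token (or label) sequence into chunks of at most SENT_LENGTH items:
SENT_LENGTH = 128   # hypothetical maximum window length

def windowing(seq):
    # Chop a sequence into consecutive chunks no longer than SENT_LENGTH,
    # so every chunk can later be padded up to the same fixed length.
    return [list(seq[i:i + SENT_LENGTH]) for i in range(0, len(seq), SENT_LENGTH)]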
Example #5
def extract_features(segment):
    data = np.asarray(extract(np.asarray(segment)))
    data = np.array([data])
    data_chu = scaler_chu.transform(data)
    data_df = scaler_df.transform(data)
    return data_chu, data_df
Example #6
def keep_predicting():
    # Watch TEST_PATH for new wav files, classify each one (word + gender),
    # play the predictions, and delete the processed file.
    # remove previous files
    for file in os.listdir(TEST_PATH):
        os.remove(os.path.join(TEST_PATH, file))
    while True:
        try:
            for file in os.listdir(TEST_PATH):
                rate, sig = wav.read(os.path.join(TEST_PATH, file))
                feat = extract(sig)
                pca_feats = pca_transform([feat])
                result = words_clf.predict(pca_feats)
                result2 = gender_clf.predict(pca_feats)
                play(result[0])
                play(result2[0])
                os.remove(os.path.join(TEST_PATH, file))
        except Exception as e:
            print(e)
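pca_transform, words_clf, gender_clf, and play are defined elsewhere in the project; a sketch of what pca_transform might wrap, assuming a scikit-learn PCA fitted once on a hypothetical training feature matrix:
# Sketch only: the training matrix and component count are hypothetical.
import numpy as np
from sklearn.decomposition import PCA

train_features = np.random.rand(200, 120)     # stand-in for the real training features
pca = PCA(n_components=40).fit(train_features)

def pca_transform(feats):
    # Project raw feature vectors into the PCA space used at training time.
    return pca.transform(feats)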
Example #7
def classify(train, test):
  vect1, vect2 = feature_extraction.extract(train, test)
  classifiers = [MultinomialNB(), SGDClassifier(random_state=0)]

  for classifier in classifiers:
    predict = cross_val_predict(classifier, vect1, train.Label_Cat)
    print(predict)
    print(precision_score(train.Label_Cat, predict, average='micro'))
    print(recall_score(train.Label_Cat, predict, average='micro'))
    print(f1_score(train.Label_Cat, predict, average='micro'))

    print(classification_report(train.Label_Cat, predict))
    print(accuracy_score(train.Label_Cat, predict))

    print('on test data')
    classifier.fit(vect1, train.Label_Cat)
    y_pred = classifier.predict(vect2)
    print(classification_report(test.Label_Cat, y_pred))
    print(accuracy_score(test.Label_Cat, y_pred))
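feature_extraction.extract here is the project's own module; a hedged sketch of an extract() that would satisfy this interface, using a TF-IDF vectorizer fitted on the training text only (the Text column name is an assumption):
# Sketch only: not the project's actual implementation.
from sklearn.feature_extraction.text import TfidfVectorizer

def extract(train, test):
    # Assumes both DataFrames carry their raw text in a 'Text' column.
    vectorizer = TfidfVectorizer()
    vect1 = vectorizer.fit_transform(train.Text)   # learn the vocabulary on train only
    vect2 = vectorizer.transform(test.Text)        # reuse it for the test split
    return vect1, vect2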
Example #8
segmentation_test.segment(patients, path_to_data, path_to_results)

#-------------------Feature Extraction Tsfresh---------------------

patients = [
    "Subject_1", "Subject_3", "Subject_4", "Subject_7", "Subject_10",
    "Subject_11", "Subject_12", "Subject_13", "Subject_14", "Subject_15",
    "Subject_16", "Subject_17"
]
path_data = "Sliding_Window_Data" + os.sep + "Sensor_Data"
path_result_features = "Features"

if not os.path.exists(path_result_features):
    os.mkdir(path_result_features)

feature_extraction.extract(patients, path_data, path_result_features)

#----------------------Extract Labels --------------------

path_data = "Sliding_Window_Data" + os.sep + "Labels"
subjects = [
    "Subject_1", "Subject_3", "Subject_4", "Subject_7", "Subject_10",
    "Subject_11", "Subject_12", "Subject_13", "Subject_14"
]

extract_labels.labels(path_data, subjects)

#-------------------Remove Unnecessary Features------------

path_result_selected_features = "Selected_features_Data"
path_data = "Features"
Example #9
    labels = np.concatenate(
        (epo_eeg_p1_tr_cl.events[:, 2], epo_eeg_p2_tr_cl.events[:, 2]))
    epoch = Datmat.transpose(2, 0, 1)
    df = pd.DataFrame()

    for ch in range(epoch.shape[2]):
        feature = extract(
            epoch[:, :, ch],
            fs,
            0.1,
            amplitude=True,
            amplitude_P300=True,
            kurtosis=True,
            skewness=True,
            std=True,
            sampen=True,
            rms=True,
            hurst=True,
            gradient=True,
            alfa=True,
            beta=True,
            theta=True,
            delta=True,
            broad_band=True,
        )

        current = pd.DataFrame(feature)
        current['class'] = labels - 1

        df = pd.concat([df, current], ignore_index=True)
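The resulting df stacks one row of features per epoch and channel, with the target in the 'class' column; a hedged sketch of fitting a classifier on it (the random forest choice and cross-validation setup are assumptions):
    # Sketch only: the classifier and evaluation scheme are assumptions.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    X = df.drop(columns='class')
    y = df['class']
    print(cross_val_score(RandomForestClassifier(), X, y, cv=5).mean())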
Example #10
import pandas as pd
import json
import os
from feature_extraction import extract

window_size = 50
step_size = 10  #80% overlap seems to work for us

file_list = os.listdir('./13april_data/')
result = []
#print(file_list)
for csv_file in file_list:
    dataframe = pd.read_csv(os.path.join('13april_data', csv_file))
    dataset = dataframe.values
    #print(dataset)
    for row in range(int((len(dataset) - window_size) / step_size)):
        processed = extract(dataset[row * step_size:row * step_size +
                                    window_size])
        processed.append(dataset[row][-1])
        # print(processed)
        #print(len(dataset))
        result.append(processed)

df = pd.DataFrame(result)
df.to_csv('preprocess_17april_logout.csv', header=False)
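The 80% overlap in the comment follows directly from the sizes: consecutive windows share window_size - step_size = 40 of 50 rows, i.e. 1 - 10/50 = 0.8. A small check of how many windows one recording yields, using a hypothetical recording length:
# With window_size = 50 and step_size = 10, consecutive windows share 40 of 50 rows.
n_rows = 1000                                         # hypothetical recording length
n_windows = int((n_rows - window_size) / step_size)   # same formula as the loop above
print(n_windows)                                      # -> 95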