Пример #1
0
def run_feature_extraction(run_from_scratch, df_corpus):
    """Extract features from the corpus or load a cached copy.

    Args:
        run_from_scratch: if True, compute features with FeatureExtractor,
            drop the raw-text columns, and cache the result as CSV;
            if False, load the previously cached CSV instead.
        df_corpus: DataFrame of the preprocessed corpus.

    Returns:
        DataFrame of extracted features.
    """
    # Single source of truth for the cache location (was duplicated in
    # both branches, risking divergence on future edits).
    csv_path = (str(get_project_root()) +
                "/data/extracted_features/extracted_features.csv")
    if run_from_scratch:
        print("\nExtracting features ...")
        df_extracted_features = FeatureExtractor(
            df_corpus).get_df_with_all_features()
        # Raw text/token columns are no longer needed once features exist.
        df_extracted_features = df_extracted_features.drop(
            ["original_content", "content", "tokens", "pos", "stems"], axis=1)
        df_extracted_features.to_csv(csv_path)
        return df_extracted_features
    return pd.read_csv(csv_path)
Пример #2
0
 def chose_images(self):
     """Load every readable ``.png`` in ``self.IMAGE_PATH`` and pair each
     image with its dominant feature descriptor.

     Returns:
         list of (image, features) tuples, one per successfully read file.
     """
     loaded = []
     for name in listdir(self.IMAGE_PATH):
         path = join(self.IMAGE_PATH, name)
         if isfile(path) and name.endswith('.png'):
             img = cv.imread(path)
             # cv.imread returns None for unreadable files; skip those.
             if img is not None:
                 loaded.append(img)
     print(len(loaded))
     return [(img, FeatureExtractor(img).extract_features_dominant())
             for img in loaded]
Пример #3
0
 def load_data(self):
     """Load all ``.jpeg`` images under ``self.PATH`` as labeled records.

     Files whose name starts with ``'xx'`` are labeled 1 (cancer),
     all others 0.

     Returns:
         list of Record(features, label) built from the extracted features.
     """
     records = []
     for file in os.listdir(self.PATH):
         if file.endswith(".jpeg"):
             # os.path.join is portable; the original "PATH + '\\' + file"
             # concatenation only worked on Windows.
             image = cv.imread(os.path.join(self.PATH, file))
             is_cancer = file.startswith('xx')
             value = 1 if is_cancer else 0
             records.append(
                 Record(
                     FeatureExtractor(
                         image).filter_img().extract_features(), value))
     return records
 def test__constructor(self):
     """get_df_with_all_features() must append every engineered feature
     column after the original corpus columns, in this exact order."""
     data = [
         [0, "Hello", "hello1", "hel", "POS", ["tok", "en"]],
         [0, "Hello2", "hello2", "hel", "POS", ["tok", "en"]],
         [1, "Hello3", "hello3", "hel", "POS", ["tok", "en"]],
         [1, "Hello4", "hello4", "hel", "POS", ["tok", "en"]],
     ]
     df = pd.DataFrame(
         data=data,
         columns=["class", "original_content", "content", "stems", "pos", "tokens"],
     )
     feature_extractor = FeatureExtractor(df)
     df_extracted = feature_extractor.get_df_with_all_features()
     expected = [
         "class",
         "original_content",
         "content",
         "stems",
         "pos",
         "tokens",
         "number_of_exclamation_mark",
         "number_of_question_mark",
         "number_of_full_stop_mark",
         "number_of_hateful_words",
         "number_of_neutral_words",
         "number_of_interjections",
         "number_of_all_caps_words",
         "number_of_quotation_marks",
         "number_of_words",
         "number_of_laughing_expressions",
         "hate_speech_unigrams",
         "hate_speech_bigrams",
         "hate_speech_trigrams",
         "pattern_count",
         "sentiment",
         "topic",
     ]
     # assertListEqual reports the first differing element on failure,
     # unlike assertTrue(list == list) which only prints "False is not true".
     self.assertListEqual(df_extracted.columns.values.tolist(), expected)
Пример #5
0
    fix_html=True,
    segmenter='twitter',
    corrector='twitter',
    unpack_contractions=True,
    spell_correct_elong=False,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons])

# Build the logger/runner pair for this experiment configuration.
logger = Logger()
runner = Runner(logger=logger, ternary=TERNARY,
                model_type='baseline', use_embeddings=True)

# Record whether a preprocessor is configured (truthiness only).
logger.write('preprocessing: %s' % (True if preprocessor else False))

# Main train/test split plus an extra expert-annotated training set.
data_loader = DataLoader(preprocessor=preprocessor)
train, test = data_loader.get_train_test(ternary=TERNARY)
extra_train = data_loader.get_train(ternary=TERNARY, \
    paths=['data/ydata-ynacc-v1_0_expert_annotations_filt.tsv'])

# Separate loader for feature extraction; NOTE(review):
# feature_preprocessor is defined above this excerpt — verify it exists.
feature_data_loader = DataLoader(preprocessor=feature_preprocessor)
feature_extractor = FeatureExtractor(data_loader=feature_data_loader,
                                     logger=logger)

# Unscaled manual + automatic features for both splits.
train_feats, test_feats = feature_extractor.get_train_test_features(
    ternary=TERNARY,
    manual=True,
    auto=True,
    scaled=False)

# NOTE(review): train_feats/test_feats are computed but never passed to
# runner.run below — confirm whether this is intentional.
runner.run(train, test, extra_train=extra_train)
# Walk the benign WordPress corpus and extract features from every .php
# file, keyed by file path. Files whose extraction fails are skipped.
f_dict = {}
rootdir_bening = os.path.join("inputs", "benign", "wordpress")
prev = 0
for root, subFolders1, files in os.walk(rootdir_bening):
    for _file in files:
        if _file.endswith(".php"):
            file_path = root + "/" + _file
            # Python 3 call form (original used the Python 2 print statement).
            print(file_path)
            fe = FeatureExtractor(file_path)
            try:
                feature_dict = fe.extract_features()
                feature_dict["label"] = 1
                f_dict[file_path] = feature_dict
                # NOTE(review): running-offset adjustment of line counts —
                # presumably converts a cumulative counter into a per-file
                # count (then +1 for the last line); confirm against
                # FeatureExtractor's semantics.
                f_dict[file_path]["number_of_lines"] -= prev
                prev += f_dict[file_path]["number_of_lines"]
                f_dict[file_path]["number_of_lines"] += 1
                chars = f_dict[file_path]["length_in_characters"]
                lines = f_dict[file_path]["number_of_lines"]
                f_dict[file_path]["characters_per_line"] = float(chars) / lines
                number_of_comments = f_dict[file_path]["number_of_comments"]
                f_dict[file_path]["average_comments_per_line"] = float(
                    number_of_comments) / lines
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; a failed file is simply dropped.
            except Exception:
                f_dict.pop(file_path, None)
Пример #7
0
from sklearn import linear_model

from feature_extraction.feature_extractor import FeatureExtractor


class FactFeelRegressor():
    """Wrapper around a pickled normalizer + linear model that scores
    feature vectors (prediction sign is flipped before returning)."""

    def __init__(self):
        # Filename typo ("noramlizer") matches the artifact on disk — do
        # not "fix" it without renaming the file itself.
        self.normalizer = self._load_pickle("Fact_Feel_noramlizer.pkl")
        self.model = self.load_model()

    @staticmethod
    def _load_pickle(filename, **kwargs):
        """Load a pickled artifact from <cwd>/Fact-Feel-App/model/.

        Extracted helper: both artifacts were loaded with the same
        open/pickle.load boilerplate, duplicated in the original.
        """
        path = os.path.join(os.getcwd(), "Fact-Feel-App", "model", filename)
        with open(path, "rb") as f_p:
            return pickle.load(f_p, **kwargs)

    def load_model(self):
        """Return the linear model; latin-1 decodes byte strings pickled
        under Python 2."""
        return self._load_pickle("Linear_Model_qr_py3.pkl",
                                 encoding="latin-1")

    def predict(self, data):
        """Normalize *data* and return the model prediction times -1."""
        t_data = self.normalizer.transform(data)
        return self.model.predict(t_data) * -1


if __name__ == "__main__":
    # Smoke test: print the working directory (both objects load pickled
    # artifacts relative to os.getcwd()) and instantiate the model and
    # extractor to verify the artifacts load cleanly.
    print(os.getcwd())
    MODEL_ = FactFeelRegressor()
    FE_ = FeatureExtractor()
Пример #8
0
def extract_features(filepath):
    """Extract features from *filepath* using the JSON-configured extractor.

    Returns whatever FeatureExtractor.extract_features produces for the file.
    """
    config_path = 'config_files/feature_extraction.json'
    extractor = FeatureExtractor(config_path)
    return extractor.extract_features(filepath)
Пример #9
0
@author: James
"""

import os
import json

from flask import Flask, render_template, request, jsonify
import logging

from model.Fact_Feel_Regression import FactFeelRegressor
from feature_extraction.feature_extractor import FeatureExtractor

# Named logger for the Fact/Feel web app.
log = logging.getLogger("FactFeel_log.txt")

# Model and feature extractor are built once at import time so every
# request reuses them instead of re-loading the pickled artifacts.
ff_model = FactFeelRegressor()
feat_extractor = FeatureExtractor()

app = Flask(__name__)
# Compact JSON responses (no pretty-printing).
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False

#class NumpyArrayEncoder(JSONEncoder):
#    def default(self, obj):
#        if isinstance(obj, numpy.ndarray):
#            return obj.tolist()
#        return JSONEncoder.default(self, obj)

@app.route("/")
def home():
    """Render the landing page with a fixed placeholder prediction."""
    placeholder = "2.56"
    return render_template("home.html", prediction=placeholder)

@app.route("/predict",methods=["POST"])