def run_feature_extraction(run_from_scratch, df_corpus):
    """Extract features from the corpus, or load a cached extraction.

    Parameters
    ----------
    run_from_scratch : bool
        When True, run the (slow) FeatureExtractor over ``df_corpus`` and
        cache the result to CSV; when False, load the cached CSV instead.
    df_corpus : pandas.DataFrame
        Preprocessed corpus (only read when ``run_from_scratch`` is True).

    Returns
    -------
    pandas.DataFrame
        Frame of extracted features.
    """
    # Single source of truth for the cache location (was duplicated in both branches).
    csv_path = str(get_project_root()) + "/data/extracted_features/extracted_features.csv"
    if run_from_scratch:
        print("\nExtracting features ...")
        df_extracted_features = FeatureExtractor(df_corpus).get_df_with_all_features()
        # Drop the raw-text columns so only derived feature columns are cached.
        df_extracted_features = df_extracted_features.drop(
            ["original_content", "content", "tokens", "pos", "stems"], axis=1)
        # NOTE(review): to_csv writes the index, so a later read_csv gains an
        # "Unnamed: 0" column — fresh and cached frames differ in shape; confirm
        # whether callers rely on that (if not, pass index=False / index_col=0).
        df_extracted_features.to_csv(csv_path)
        return df_extracted_features
    # The if-branch always returns, so no `else` is needed here.
    return pd.read_csv(csv_path)
def chose_images(self):
    """Load every readable .png under IMAGE_PATH and pair each image with
    its dominant-feature vector.

    Returns a list of ``(image, features)`` tuples; unreadable files
    (``cv.imread`` returned None) are silently skipped.
    """
    loaded = []
    for entry in listdir(self.IMAGE_PATH):
        candidate = join(self.IMAGE_PATH, entry)
        # Same filter as before: regular files with a .png extension only.
        if not (isfile(candidate) and entry.endswith('.png')):
            continue
        picture = cv.imread(candidate)
        if picture is not None:
            loaded.append(picture)
    print(len(loaded))
    return [(picture, FeatureExtractor(picture).extract_features_dominant())
            for picture in loaded]
def load_data(self):
    """Load all .jpeg images from ``self.PATH`` and build labeled Records.

    Files whose names start with ``'xx'`` are labeled 1 (positive/cancer),
    all others 0.

    Returns
    -------
    list[Record]
        One Record per .jpeg file, pairing extracted features with its label.
    """
    records = []
    for file_name in os.listdir(self.PATH):
        if not file_name.endswith(".jpeg"):
            continue
        # os.path.join is portable; the original hard-coded the Windows
        # '\\' separator, which breaks on POSIX systems.
        image = cv.imread(os.path.join(self.PATH, file_name))
        label = 1 if file_name.startswith('xx') else 0
        records.append(
            Record(FeatureExtractor(image).filter_img().extract_features(),
                   label))
    return records
def test__constructor(self):
    """FeatureExtractor.get_df_with_all_features must keep the original
    columns and append every derived feature column, in this exact order."""
    data = [
        [0, "Hello", "hello1", "hel", "POS", ["tok", "en"]],
        [0, "Hello2", "hello2", "hel", "POS", ["tok", "en"]],
        [1, "Hello3", "hello3", "hel", "POS", ["tok", "en"]],
        [1, "Hello4", "hello4", "hel", "POS", ["tok", "en"]],
    ]
    df = pd.DataFrame(
        data=data,
        columns=["class", "original_content", "content", "stems", "pos", "tokens"],
    )
    feature_extractor = FeatureExtractor(df)
    df_extracted = feature_extractor.get_df_with_all_features()
    expected = [
        "class",
        "original_content",
        "content",
        "stems",
        "pos",
        "tokens",
        "number_of_exclamation_mark",
        "number_of_question_mark",
        "number_of_full_stop_mark",
        "number_of_hateful_words",
        "number_of_neutral_words",
        "number_of_interjections",
        "number_of_all_caps_words",
        "number_of_quotation_marks",
        "number_of_words",
        "number_of_laughing_expressions",
        "hate_speech_unigrams",
        "hate_speech_bigrams",
        "hate_speech_trigrams",
        "pattern_count",
        "sentiment",
        "topic",
    ]
    # assertListEqual reports the first differing element and a diff on
    # failure; assertTrue(a == b) only prints "False is not true".
    self.assertListEqual(df_extracted.columns.values.tolist(), expected)
fix_html=True, segmenter='twitter', corrector='twitter', unpack_contractions=True, spell_correct_elong=False, tokenizer=SocialTokenizer(lowercase=True).tokenize, dicts=[emoticons]) logger = Logger() runner = Runner(logger=logger, ternary=TERNARY, model_type='baseline', use_embeddings=True) logger.write('preprocessing: %s' % (True if preprocessor else False)) data_loader = DataLoader(preprocessor=preprocessor) train, test = data_loader.get_train_test(ternary=TERNARY) extra_train = data_loader.get_train(ternary=TERNARY, \ paths=['data/ydata-ynacc-v1_0_expert_annotations_filt.tsv']) feature_data_loader = DataLoader(preprocessor=feature_preprocessor) feature_extractor = FeatureExtractor(data_loader=feature_data_loader, logger=logger) train_feats, test_feats = feature_extractor.get_train_test_features( ternary=TERNARY, manual=True, auto=True, scaled=False) runner.run(train, test, extra_train=extra_train)
# Walk the benign WordPress corpus and extract a feature dict per .php file,
# labeling every file as benign (label = 1 here; NOTE(review): 1 usually means
# the positive/malicious class — confirm the label convention with the trainer).
# Python 2 code (print statements).
f_dict = {}
rootdir_bening = os.path.join("inputs", "benign", "wordpress")
# i = 0
prev = 0
for root, subFolders1, files in os.walk(rootdir_bening):
    for _file in files:
        if _file.endswith(".php"):
            file_path = root + "/" + _file
            print file_path
            # with open(file_path, "r") as fin:
            #     _file = fin.read()
            # with open(file_path, "a") as fin:
            #     if not _file.rstrip(" ").rstrip("\n").endswith("?>"):
            #         fin.write("?>")
            fe = FeatureExtractor(file_path)
            try:
                feature_dict = fe.extract_features()
                feature_dict["label"] = 1
                f_dict[file_path] = feature_dict
                # NOTE(review): 'prev' accumulates across files — this only makes
                # sense if extract_features reports a line count that is cumulative
                # over successive FeatureExtractor calls; confirm that assumption.
                f_dict[file_path]["number_of_lines"] -= prev
                prev += f_dict[file_path]["number_of_lines"]
                # +1 presumably converts a 0-based count to an actual line total
                # (and guards the divisions below against lines == 0) — TODO confirm.
                f_dict[file_path]["number_of_lines"] += 1
                chars = f_dict[file_path]["length_in_characters"]
                lines = f_dict[file_path]["number_of_lines"]
                f_dict[file_path]["characters_per_line"] = float(chars) / lines
                number_of_comments = f_dict[file_path]["number_of_comments"]
                f_dict[file_path]["average_comments_per_line"] = float(
                    number_of_comments) / lines
            except:
                # NOTE(review): bare except silently drops any file whose
                # extraction fails, hiding real errors — consider logging.
                f_dict.pop(file_path, None)
from sklearn import linear_model
from feature_extraction.feature_extractor import FeatureExtractor


class FactFeelRegressor():
    """Linear fact-vs-feel regressor backed by two pickled artifacts:
    a feature normalizer and the fitted linear model."""

    def __init__(self):
        # Both pickles live under <cwd>/Fact-Feel-App/model/.
        # (Filename typo "noramlizer" matches the artifact on disk.)
        normalizer_path = os.path.join(
            os.getcwd(), "Fact-Feel-App", "model", "Fact_Feel_noramlizer.pkl")
        with open(normalizer_path, "rb") as f_p:
            self.normalizer = pickle.load(f_p)
        self.model = self.load_model()

    def load_model(self):
        """Unpickle and return the fitted linear model (py2-era pickle,
        hence the latin-1 encoding)."""
        model_path = os.path.join(
            os.getcwd(), "Fact-Feel-App", "model", "Linear_Model_qr_py3.pkl")
        with open(model_path, "rb") as f_p:
            return pickle.load(f_p, encoding="latin-1")

    def predict(self, data):
        """Normalize ``data`` and return the sign-flipped model prediction."""
        normalized = self.normalizer.transform(data)
        prediction = self.model.predict(normalized)
        return prediction * -1


if __name__ == "__main__":
    print(os.getcwd())
    MODEL_ = FactFeelRegressor()
    FE_ = FeatureExtractor()
def extract_features(filepath):
    """Run feature extraction on *filepath* using the JSON-configured extractor.

    Returns whatever ``FeatureExtractor.extract_features`` produces for the file.
    """
    config = 'config_files/feature_extraction.json'
    return FeatureExtractor(config).extract_features(filepath)
@author: James
"""
import os
import json
from flask import Flask, render_template, request, jsonify
import logging
from model.Fact_Feel_Regression import FactFeelRegressor
from feature_extraction.feature_extractor import FeatureExtractor

# Module-level singletons: the regressor (which unpickles its model) and the
# feature extractor load once at import time.
# NOTE(review): heavy side effects at import — consider lazy initialization.
log = logging.getLogger("FactFeel_log.txt")
ff_model = FactFeelRegressor()
feat_extractor = FeatureExtractor()

app = Flask(__name__)
# Compact JSON responses (no pretty-printing).
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False

#class NumpyArrayEncoder(JSONEncoder):
#    def default(self, obj):
#        if isinstance(obj, numpy.ndarray):
#            return obj.tolist()
#        return JSONEncoder.default(self, obj)

@app.route("/")
def home():
    # Landing page; "2.56" is a hard-coded placeholder prediction for the template.
    return render_template("home.html",prediction="2.56")

# /predict handler body continues beyond this chunk.
@app.route("/predict",methods=["POST"])