def preProcessTweets():
    processedTweets = []
    # Words that carry no signal for this corpus and are filtered out below.
    noise_words = {'http', 'get', 'is', 'ny', 'lol', 'na', 'u', '-', 'us', 'im'}
    with open("E:/twitter10k.csv", newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            try:
                tweet = row[-2]
                preProcessor = preprocessor.Preprocessor()
                processedTweet = preProcessor.preprocess_text(tweet)
                for word in processedTweet:
                    if word not in noise_words:
                        processedTweets.append(word)
            except Exception:
                # Skip malformed rows instead of aborting the whole run.
                pass
    return processedTweets
def __init__(self, image_file_paths, mean_image=None, bandstats_file_path=None,
             original_size_x=650, original_size_y=650, input_size=256,
             slice_count_x=1, slice_count_y=1, is_8_channel=True):
    super().__init__()
    # logger.info("Creating image list dataset from {} images".format(str(len(image_file_paths))))
    self.preprocessor = preprocessor.Preprocessor(
        datapath=None,
        original_size_x=original_size_x,
        original_size_y=original_size_y,
        input_size=input_size,
        slice_count_x=slice_count_x,
        slice_count_y=slice_count_y,
        is_8_channel=is_8_channel)
    # Point the preprocessor at the bandstats file explicitly; it cannot
    # discover the location on its own.
    self.preprocessor.path_mgr.bandstats_file = bandstats_file_path
    self.image_file_paths = image_file_paths
    self.slice_count = slice_count_x * slice_count_y
    self.current_image_path = ""
    self.is_8_channel = is_8_channel
    self.preloaded_slices = {}
    # TODO(martun): later change mean subtraction into a transformation.
    self.mean_image = mean_image
def voodooOneFile(fullName, inputPath, fileList):
    fullOutput = fullOutputName(fullName, inputPath)
    mkdirOf(fullOutput)
    output = ''
    try:
        output += voodoo.voodoo(input=fullName,
                                output=fullOutput,
                                pathToRemoveFromIdentifier=inputPath,
                                voodooDBFile=args.voodooDB,
                                includes=args.includePath,
                                defines=args.define,
                                trace=False,
                                preIncludes=args.preInclude)
        state = "V"
    except Exception as e:
        if str(e).find("all argume") != -1:
            raise
        inputLines = voodoo._readLinesOfFile(fullName)
        prepro = preprocessor.Preprocessor(fullName, fullOutput, inputLines, inputPath)
        output += prepro.intercepter()
        output += "\n/* The error that forced interception:\n" + \
            str(e).replace("*/", "* /") + "\n"
        output += "\n"
        output += "Voodoo stack trace:\n" + traceback.format_exc()
        output += "*/\n"
        output += "\n"
        state = "I"
def main():
    pp = preprocessor.Preprocessor()
    csvs = []
    csvs.extend(pp.do_udc())
    csvs.extend(pp.do_cmdc())
    csvs.extend(pp.do_wiki())
    csvs.extend(pp.do_bdc())
    lexicon.Lexicon(csvs)
def preprocessor(argv):
    mp3_path = argv[2]
    book_path = argv[3]
    print("Running the preprocessor.")
    print("Path to mp3: " + mp3_path)
    print("Path to book: " + book_path)
    # Named `prep` to avoid shadowing this function's own name.
    prep = pr.Preprocessor(mp3_path, book_path, PREPROCESSOR_PATH)
    prep.preprocess()
def set_preprocessor(self, preprocessor_):
    # Normalize the argument to a list of preprocessors.
    if preprocessor_ is None:
        preprocessor_ = [preprocessor.Preprocessor()]
    elif not isinstance(preprocessor_, list):
        preprocessor_ = [preprocessor_]
    self.preprocessors = preprocessor_
    Xtrain, ytrain = self.split(self.training_data, self.target_column)
    for pp in self.preprocessors:
        pp.fit(Xtrain)
        Xtrain = self._preprocess_one(Xtrain, pp)
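# Usage sketch for set_preprocessor above (hypothetical `trainer` object; assumes
# the surrounding class was constructed with training_data and target_column).
# Because the argument is normalized to a list, these two calls are equivalent:
#
#     trainer.set_preprocessor(preprocessor.Preprocessor())
#     trainer.set_preprocessor([preprocessor.Preprocessor()])
#
# Passing None also works and falls back to a single default Preprocessor.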
def __init__(self, nominal_src, nominal_file, conditions_ls):
    self.src = nominal_src
    self.input_name = nominal_file
    self.nominal_model = preprocessor.Preprocessor(nominal_src + nominal_file)
    self.nominal_model.clean_input()
    self.nominal_model.generate_species_classes()
    self.Temp_ls = [conditions_ls[0]]    # should be a list
    self.Pres_ls = [conditions_ls[1]]    # should be a list
    self.Energy_grid = conditions_ls[2]  # should be a float
    self.new_ne_file = []
def setUp(self):
    # Initial runtime environment.
    args = {"config_file": "../config_omniphotos.yaml"}
    self.preprocessor = preprocessor.Preprocessor(args)
    self.preprocessor.root_dir = \
        pathlib.Path("D:/workdata/testDatasets/circular/KyotoShrines_test")
    self.preprocessor.image_output_path = \
        pathlib.Path("D:/workdata/testDatasets/circular/KyotoShrines_test/Input")
    self.preprocessor.FPS = 50
    self.preprocessor.omniphotos_config_template_path = \
        "D:/workspace/Python/preprocessing/template/config.yaml.template"
def main():
    # The original --project_dir/--report_dir flags were required but then
    # immediately overwritten by hard-coded paths, so only the hard-coded
    # benchmark list below is kept.
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_files_to_print", type=int, required=False, default=20)
    args = parser.parse_args()
    num_files_to_print = args.num_files_to_print

    # The three benchmark projects and their bug repositories.
    datasets = [
        ("../data/ZXing", "../data/ZXing/ZXingBugRepository.xml"),
        ("../data/Rhino", "../data/Rhino/RhinoBugRepository.xml"),
        ("../data/JodaTime/", "../data/JodaTime/JodaTimeBugRepository.xml"),
    ]
    for project_dir, report_dir in datasets:
        project_report_info = preprocessor.Preprocessor(project_dir, report_dir)
        similarity_info = similarity_calculator.SimilarityCalculator(
            project_report_info.project_frequency_dict,
            project_report_info.report_frequency_dict,
            project_report_info.xml_report,
            num_files_to_print)
def preprocess_input(document, lower=True, remove_punctuation=False,
                     remove_stop_words=False):
    preprocessor = pp.Preprocessor()
    if lower:
        document = document.lower()
    if remove_punctuation:
        document = preprocessor.remove_punctuation(document)
    if remove_stop_words:
        document = preprocessor.remove_stop_words(document, german=True, english=True)
    return document
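# Minimal usage sketch for preprocess_input (hypothetical input string; relies
# only on the function defined above and its pp.Preprocessor dependency):
if __name__ == "__main__":
    cleaned = preprocess_input(
        "Ein BEISPIEL, mit Satzzeichen!",  # mixed-case German sample text
        lower=True,
        remove_punctuation=True,
        remove_stop_words=True,
    )
    print(cleaned)  # lowercased, punctuation and German/English stop words removed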
def make_dataframe(self):
    test_df = pd.read_csv(self.test_path, names=['x', 'y'])
    test_data = preprocessor.Preprocessor(test_df, self.model.vocab)
    test_data.tokenize('x')
    test_data.add_tags('x')
    test_data.lemmatize('x')
    test_data.update_dataframe('x', 'y')
    (test_data.data['pos_score'],
     test_data.data['neg_score'],
     test_data.data['likelihood_pos']) = self.model.predict(test_data.data)
    return test_data.data
def __init__(self, input_path, nominal_file, perturb_dict, nominal_dict,
             abstraction=False):
    self.input_path = input_path
    self.input_name = nominal_file
    self.nominal_model = preprocessor.Preprocessor(input_path + nominal_file)
    self.nominal_model.clean_input()
    self.nominal_model.generate_species_classes(abstraction=abstraction)
    self.perturb_dict = perturb_dict
    self.nominal_dict = nominal_dict
    self.abstraction = abstraction
def main():
    prep = preprocessor.Preprocessor()
    # BUG:
    raw_img_data = prep.read_img_jpeg_bytes("./pedestrain.jpg")
    warm_up(prep)
    server_addr = "/tmp/coin_dl_server"
    client_addr = "/tmp/coin_dl_client"
    # Remove stale Unix domain sockets left over from previous runs.
    for addr in [client_addr, server_addr]:
        try:
            os.remove(addr)
        except OSError:
            pass
    main_loop(server_addr, client_addr, prep, raw_img_data)
def warm_up(self, det, mode):
    self.logger.info("Warming up the detector")
    start = time.time()
    raw_img_data = det.read_img_jpeg_bytes("./pedestrain.jpg")
    if mode == "raw":
        # Warm up the session; the first inference is slow.
        ret = det.inference(raw_img_data)
        ret = det.get_detection_results(*ret)
    elif mode == "preprocessed":
        prep = preprocessor.Preprocessor()
        compressed_img_data = prep.inference(raw_img_data, 70)
        ret = det.inference(compressed_img_data)
        ret = det.get_detection_results(*ret)
    duration = time.time() - start
    self.logger.info(
        f"Warm-up for mode {mode} finished! Took {duration} seconds")
def generate_files():
    check_nltk_resources()
    check_paths()
    p = preprocessor.Preprocessor()
    df = p.load_dataset()
    p.preprocess_synopses(df)
    p.preprocess_genres(df)
    p.build_indexes()
    if settings.USE_W2V:
        p.generate_embedding_weights()
    p.filter_dataset()
    p.encode_genres()
    p.encode_synopses()
    p.save_data()
def get_predictions(g, n):
    possible_genres = list(g.mlb.classes_)
    print("Possible film genres: ", ','.join(possible_genres))
    input_line = 'r'  # input("Insert a comma separated set of genres (r for random, q for quit): ")
    if input_line == 'q':
        exit()
    randomly = input_line == 'r'
    p = preprocessor.Preprocessor()
    if randomly:
        n_genres = random.randint(1, 6)
        input_genres = random.sample(possible_genres, n_genres)
    else:
        input_genres = input_line.split(',')
        for ig in input_genres:
            if ig not in possible_genres:
                print(ig + " is not a possible genre")
                get_predictions(g, n)
                return
    print("Input genres: ", ', '.join(input_genres))
    encoded_genres = g.mlb.transform([input_genres])
    mode = input("Input g or b for greedy or beam search mode: ")
    previous_words = input("Introduce help/previous words (optional): ")
    previous_words = p.clean_text(previous_words)
    previous_words = p.tokenize(previous_words)[:-1]
    # Map out-of-vocabulary words to the unknown token.
    prvs = []
    for pw in previous_words:
        if pw in g.word_to_index.keys():
            prvs.append(pw)
        else:
            prvs.append(settings.UNKNOWN_TOKEN)
    previous_words = prvs if prvs else None
    print("Starting words: " + str(previous_words))
    if mode == 'g':
        print("Greedy search mode")
        syn = get_predictions_greedy(g, n, encoded_genres, previous_words)
    elif mode == 'b':
        print("Beam search mode")
        syn = get_predictions_beam(g=g, n=n, encoded_genres=encoded_genres,
                                   previous_words=previous_words)
    else:
        print("Wrong mode")
        get_predictions(g, n)
        return
    print("Synopsis: ", syn)
    get_predictions(g, n)
class FramePipeline:

    def __init__(self, frameWidth, frameHeight):
        self.frameWidth = frameWidth
        self.frameHeight = frameHeight
        self.cam = cm.Camera()
        self.preprocessor = prep.Preprocessor()
        self.homographyOp = homo.Homography()
        self.currOriginalFrame = None
        self.cam.init(9, 6, 'camera_cal/calibration*.jpg')
        self.cam.calibrate()
        self.homographyOp.setFrameSize(frameWidth, frameHeight)
        self.homographyOp.estimateRoadHomography()
        self.laneLinesFinder = lf.LaneLinesFinder(frameWidth, frameHeight)
        self.visualizer = visu.Visualizer(self.laneLinesFinder, self)

    def processFrame(self, InputImg):
        self.currOriginalFrame = InputImg
        undistortedImg = self.cam.undistortImg(InputImg)
        sobelImg = self.preprocessor.extractEdges(undistortedImg, 'all')
        croppedImg = self.preprocessor.crop(sobelImg)
        rectImg = self.homographyOp.warp(croppedImg)
        warped_out = self.laneLinesFinder.findLane(rectImg)
        output = self.visualizer.visualizeFrame(rectImg)
        # Only for the report at the end:
        # cv2.imwrite('afterUndist.jpg', undistortedImg)
        # cv2.imwrite('afterSobel.jpg', sobelImg)
        # cv2.imwrite('afterCropping.jpg', croppedImg)
        # cv2.imwrite('afterRectifying.jpg', rectImg)
        # cv2.imwrite('afterFitting.jpg', warped_out)
        # cv2.imwrite('afterWarpingBack.jpg', output)
        # cv2.waitKey()
        return output
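# Usage sketch for FramePipeline (hypothetical frame source; assumes OpenCV and
# the calibration images under camera_cal/ are available):
#
#     import cv2
#     pipeline = FramePipeline(frameWidth=1280, frameHeight=720)
#     frame = cv2.imread('test1.jpg')           # hypothetical input frame
#     annotated = pipeline.processFrame(frame)  # undistort -> edges -> crop -> warp -> fit
#     cv2.imwrite('annotated.jpg', annotated)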
def __init__(self, methodName):
    super().__init__(methodName)
    self.preprocessor = preprocessor.Preprocessor()
    column_names = ['label', 'text']
    data_to_process = {
        'label': ['ham', 'ham', 'spam'],
        'text': [
            'Not normalized #$text',
            ' AnOthEr not normalized TEXT',
            'Not normalized SPAM'
        ],
    }
    self.dataset_to_process = pd.DataFrame(data_to_process, columns=column_names)
    correct_data = {
        'label': [0, 0, 1],
        'text': ['normalized text', 'another normalized text', 'normalized spam'],
    }
    self.correct_dataset = pd.DataFrame(correct_data, columns=column_names)
def test2():
    '''
    ===========================================================
    Steps:
        1. Keep numbers and punctuation
        2. No tokenizer
        3. Keep stop words
        4. Add POS tags
    ===========================================================
    '''
    print('running preprocessor test 2 ...')
    pattern = re.compile(r'[^а-яА-Я0-9,.!?;\- ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=True,
                                 remove_stop_words=False,
                                 lemmatize=True,
                                 tokenizer=None)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [[
        'так_ADV', 'говорить_VERB', 'в_ADP', 'июль_NOUN', '1805_NUM',
        'год_NOUN', 'известный_ADJ'
    ], [
        'как_CONJ', 'можно_ADJ', 'быть_VERB', 'здоровой..._CONJ',
        'когда_CONJ', 'нравственно_ADV', 'страдаешь?_PRON'
    ], ['праздник_NOUN', 'отменен,_VERB']]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 2 passed')
def test4():
    '''
    ===========================================================
    Steps:
        1. Keep numbers and punctuation
        2. Razdel tokenizer
        3. Keep stop words
        4. No POS tagging
        5. No lemmatization
    ===========================================================
    '''
    print('running preprocessor test 4 ...')
    pattern = re.compile(r'[^а-яА-Я0-9,.!?;\- ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    tokenizer = razdel
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=False,
                                 remove_stop_words=False,
                                 lemmatize=False,
                                 tokenizer=tokenizer)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [[
        'так', 'говорила', 'в', 'июле', '1805', 'года', 'известная'
    ], [
        'как', 'можно', 'быть', 'здоровой', '...', 'когда',
        'нравственно', 'страдаешь', '?'
    ], ['праздник', 'отменен', ',']]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 4 passed')
def test():
    raw_img_data = Detector.read_img_jpeg_bytes("./pedestrain.jpg")
    det = Detector(mode="raw")
    ret = det.inference(raw_img_data)
    resp = det.get_detection_results(*ret)
    print("*** Inference result of raw image!")
    print(resp)
    del det
    gc.collect(1)
    gc.collect(2)

    # Test detection of a preprocessed image.
    prep = preprocessor.Preprocessor()
    compressed_img_data = prep.inference(raw_img_data, 70)
    print(
        f"*** Raw image size: {len(raw_img_data)}B, preprocessed image size: {len(compressed_img_data)}B"
    )
    det = Detector(mode="preprocessed")
    ret = det.inference(compressed_img_data)
    resp_prep = det.get_detection_results(*ret)
    print("*** Inference result of preprocessed image!")
    print(resp_prep)
def doc2vec(self, sentences):
    fname = get_tmpfile('doc2vec.model')
    edited_sentences = {}
    train_corpus = []
    count = 0
    for index, sentence in sentences.items():
        processed_sentence = preprocessor.Preprocessor(sentence).preprocessData()
        if not processed_sentence:
            continue
        tokens = gensim.utils.simple_preprocess(processed_sentence)
        # Tags must be a list; a bare string would be iterated character by
        # character, silently creating one tag per digit.
        train_corpus.append(TaggedDocument(tokens, [str(count)]))
        edited_sentences[count] = sentence
        count += 1
    model = Doc2Vec(train_corpus,
                    vector_size=10,
                    dbow_words=1,
                    dm=1,
                    window=2,
                    min_count=2)
    return (model, train_corpus, edited_sentences)
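# Sketch of querying the model returned by doc2vec above (hypothetical caller
# and tokens; infer_vector and dv.most_similar are standard gensim Doc2Vec API):
#
#     model, train_corpus, edited_sentences = obj.doc2vec(sentences)
#     vector = model.infer_vector(['some', 'unseen', 'tokens'])
#     similar = model.dv.most_similar([vector], topn=3)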
def test1():
    '''
    ===========================================================
    Full house:
        1. Leave only alphabet characters
        2. Remove stop words
        3. Lemmatize and add POS tags
    ===========================================================
    '''
    print('running preprocessor test 1 ...')
    pattern = re.compile(r'[^а-яА-Я ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=True,
                                 remove_stop_words=True,
                                 lemmatize=True,
                                 tokenizer=None)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [
        ['говорить_VERB', 'июль_NOUN', 'год_NOUN', 'известный_ADJ'],
        ['здоровый_ADJ', 'нравственно_ADV', 'страдать_VERB'],
        # Here is the case where lemmatization fails:
        ['праздник_NOUN', 'отменный_ADJ']
    ]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 1 passed')
def getNews(self):
    links = self.getGoogleLinks()
    print(len(links))
    news = {}
    for item in range(self.number):
        # Fetch the article text.
        date = int(links[item][0])
        news[date] = {}
        link = links[item][1]
        article = requests.get(link)
        soup = BeautifulSoup(article.text, "html.parser")
        for script in soup(["script", "style", "meta", "noscript"]):
            script.extract()  # rip it out
        text = soup.get_text()
        # Derive the source name from the link.
        source_1 = re.search(r'\.\s*([^.]*)', link).group(1)
        source_2 = re.search(r'//\s*([^.]*)', link).group(1)
        if "/" in source_1:
            source = source_2
        else:
            source = source_1
        news[date]['source'] = source
        news[date]['text'] = preprocessor.Preprocessor(text).preprocessData()
    return news
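# Worked example of the source extraction above, for hypothetical links:
#   link = 'https://www.example.com/story':
#     source_1 captures the text after the first '.'  -> 'example' (no '/', so it is used)
#   link = 'https://example.com/story':
#     source_1 -> 'com/story' (contains '/'), so source_2 after '//' -> 'example'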
try:
    output += voodoo.voodooExpectSource(
        input=fullName,
        output=fullOutput,
        pathToRemoveFromIdentifier=inputPath,
        voodooDBFile=args.voodooDB,
        includes=args.includePath,
        defines=args.define,
        trace=False,
        preIncludes=args.preInclude)
    state = "V"
except Exception as e:
    if str(e).find("all argume") != -1:
        raise
    inputLines = voodoo._readLinesOfFile(fullName)
    prepro = preprocessor.Preprocessor(fullName, fullOutput, inputLines, inputPath)
    output += prepro.intercepter()
    output += "\n/* The error that forced interception:\n" + \
        str(e).replace("*/", "* /") + "\n"
    output += "\n"
    output += "Voodoo stack trace:\n" + traceback.format_exc()
    output += "*/\n"
    output += "\n"
    state = "I"
f = open(fullOutput, "w")
f.write(output)
f.flush()
f.close()
sys.stdout.write(" <%d/%d> %s %s\n" % (1 + fileList.index((fullName, inputPath)),
                                       len(fileList), state, fullOutput))
import csv

import matplotlib.pyplot as plt
import pandas as pd
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import EarlyStopping

import preprocessor
import featureanalysis

# Above this value, the survival flag will be true.
PROBABILITY_MARGIN_SURVIVAL = 0.5

prepr = preprocessor.Preprocessor()
prepr.process_training_dataset('train.csv')
df = pd.read_csv('train.csv')

# Perform feature analysis.
numerical_features = ["Survived", "SibSp", "Parch", "Age", "Fare"]
feat_analysis = featureanalysis.FeatureAnalysis(df)
feat_analysis.get_correlation_numerical_values(numerical_features)

# Cabin and name columns were removed.
input_value, output = prepr.get_train_datasets()

# Get the number of columns in the training data.
n_cols = input_value.shape[1]
import sys

import preprocessor

CMD_OVERWRITE_OPTION = '-ow'

if __name__ == "__main__":
    # Check arguments.
    if len(sys.argv) >= 3:
        # At least two arguments have been passed.
        inp = sys.argv[1]
        out = sys.argv[2]
        # Define the overwrite option.
        overwrite = len(sys.argv) >= 4 and sys.argv[3] == CMD_OVERWRITE_OPTION
        p = preprocessor.Preprocessor(preprocessor.Language.vietnamese)
        try:
            p.preprocess_files(inp, out, {'overwrite': overwrite})
        except (FileNotFoundError, FileExistsError) as errors:
            for e in errors.args:
                if e:
                    print(e)
    else:
        print('Missing arguments. Arguments: input output [-ow]')
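# Invocation sketch for the script above (hypothetical script and file names;
# the trailing -ow flag enables overwriting an existing output file):
#
#     python run_preprocessor.py corpus/raw.txt corpus/clean.txt -ow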
import time

from extractor import Extractor
import preprocessor
import dictionary
import vectorizer
import dataset_divider
import classifier

start = time.time()

PreProcessor = preprocessor.Preprocessor()
Dictionary = dictionary.Dictionary()

categories = [1, 2, 3]  # Categories to be included
lines = []
for category in categories:
    lines.append(
        Extractor.extract(('flashback' + str(category) + '.json'),
                          ('extracted' + str(category) + '.txt')))
    dataset_divider.Divider.divide(('extracted' + str(category) + '.txt'),
                                   lines[len(lines) - 1])

# Pre-processing of training data.
processed = []
processed_test = []
for category in categories:
    processed.append(
        PreProcessor.preprocess('training' + str(category) + ".txt"))
    processed_test.append(
        PreProcessor.preprocess('testing' + str(category) + ".txt"))

with open("testingposts.txt", "w") as file:
import argparse

import preprocessor
from definitions import TEST_PROCESSED_PATH, TRAIN_PROCESSED_PATH


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--train', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    preprocess = preprocessor.Preprocessor(train=args.train, dl=False)
    preprocess_data = preprocess.clean_data()
    path = TRAIN_PROCESSED_PATH if args.train else TEST_PROCESSED_PATH
    preprocess_data.to_csv(path, encoding='utf-8', index=False)
def main(): """Main method for controlling the flow of the stylometric analyser. Function for creating of objects for word, character, punctuation, word length etc analysis.= to determine the patterns of styles in different works. """ #Column names colnames = ['work', 'char_freq', 'punc_freq', 'stop_freq', 'word_len_freq'] #Initializing an empty dataframe to store all stats after analysis all_text_stats = pd.DataFrame(columns=colnames) #Try block try: #-----------------------------Analysis---------------------------------- #Main loop for doing the analysis file by file for work in works: #calling read_input function to read the content of each file content = read_input(work) #Creating object for preprocessor class pre_processor = prpscr.Preprocessor() pre_processor.tokenise(content) #Fetching the tokens tokens = pre_processor.get_tokenised_list() #Creating object for CharacterAnalyser class char_analyser = char.CharacterAnalyser() #Analysing at character level char_analyser.analyse_characters(tokens) #Fetching the character occurences ch_occ = char_analyser.char_occ #Fetching the punctuation occurences punc_occ = char_analyser.get_punctuation_frequency() #Creating object for WordAnalyser class word_analyser = word.WordAnalyser() #Analysing at word level word_analyser.analyse_words(tokens) #Fetching the stop word occurences stop_occ = word_analyser.get_stopword_frequency() #Fetching the word length occurences word_len_occ = word_analyser.get_word_length_frequency() #Temporary df to store all the analysis for one text at a time temp_df = pd.DataFrame( [[work, ch_occ, punc_occ, stop_occ, word_len_occ]], columns=colnames) all_text_stats = all_text_stats.append(temp_df, ignore_index=True) #-----------------------------Visualisation----------------------------- #Creating object for Visualiser class visualiser = vis.AnalysisVisualiser(all_text_stats) #Visualising punctuation frequencies in all the works visualiser.visualise_punctuation_frequency() #Visualising character frequencies in all the works visualiser.visualise_character_frequency() #Visualising stopword frequencies in all the works visualiser.visualise_stopword_frequency() #Visualising word length frequencies in all the works visualiser.visualise_word_length_frequency() #Catch for exceptions except ImportError as err: print( 'IMPORT ERROR :', err, '. Please check the working directory, name or ' + 'make sure that module is imported!') except TypeError as err: print('TYPE ERROR :', err) except IndexError as err: print('INDEX ERROR :', err) except ValueError as err: print('VALUE ERROR :', err) except IOError as err: print('INPUT ERROR :', err, '. Please check the path of the file!') except requests.RequestException as err: print('REQUEST ERROR :', err) except: print('UNEXPECTED ERROR!')