def test_process_labels_target_five_labels(self):
    input = cleandoc('''
        jump :label1 notEqual @unit null
        label1:
        label2:
        jump :label2 notEqual @unit null
        ubind @mono
        label3:
        jump :label5 notEqual @unit null
        label4:
        end
        jump :label4 notEqual @unit null
        label5:
        jump :label3 notEqual @unit null
    ''')
    expected = cleandoc('''
        jump 2 notEqual @unit null
        jump 2 notEqual @unit null
        ubind @mono
        jump 7 notEqual @unit null
        end
        jump 5 notEqual @unit null
        jump 4 notEqual @unit null
    ''')
    uut = Preprocessor(input)
    uut.process_labels()
    actual = uut.result
    self.assertEqual(expected, actual)

def test_process_labels(self):
    input = CUSTOM_SIMPLE
    expected = STANDARD_SIMPLE
    uut = Preprocessor(input)
    uut.process_labels()
    actual = uut.result
    self.assertEqual(expected, actual)

def test_process_labels_target_does_not_exist(self):
    input = cleandoc('''
        jump :label1 notEqual @unit null
        ubind @mono
        end
    ''')
    uut = Preprocessor(input)
    with self.assertRaises(ValueError):
        uut.process_labels()

def test_process_labels_bad_target_error_line_number_simple_program(self):
    input = cleandoc('''
        jump :label1 notEqual @unit null
        ubind @mono
        end
    ''')
    uut = Preprocessor(input)
    with self.assertRaisesRegex(ValueError, r'Line 1'):
        uut.process_labels()

def preprocess(p_src: str) -> str:
    global tok_debug
    global parse_debug
    global env_debug
    pps = Preprocessor()
    src, flags = pps.scan(p_src)
    tok_debug = flags["tok_debug"]
    parse_debug = flags["parse_debug"]
    env_debug = flags["env_debug"]
    return src

def test_process_labels_unreferenced_target(self):
    input = cleandoc('''
        jump 3 notEqual @unit null
        ubind @mono
        label1:
        end
    ''')
    expected = STANDARD_SIMPLE
    uut = Preprocessor(input)
    uut.process_labels()
    actual = uut.result
    self.assertEqual(expected, actual)

def test_color2bw(self):
    '''
    Loads the main input image and converts it to grayscale.
    '''
    img_test = cv2.imread('test/img/img.tif')
    dimensions = Preprocessor.color2bw(self, img_test)
    self.assertEqual(np.shape(dimensions), (5000, 5000))

def test_scale_images(self):
    '''
    Loads the 4 pieces of the full image, each with dimensions (2500, 2500),
    in grayscale so that the extra channel dimension imread would otherwise
    add is dropped.
    '''
    # Must read the 4 incoming images
    img_test_1 = cv2.imread('test/img/test_crop_image_1.png', cv2.IMREAD_GRAYSCALE)
    img_test_2 = cv2.imread('test/img/test_crop_image_2.png', cv2.IMREAD_GRAYSCALE)
    img_test_3 = cv2.imread('test/img/test_crop_image_3.png', cv2.IMREAD_GRAYSCALE)
    img_test_4 = cv2.imread('test/img/test_crop_image_4.png', cv2.IMREAD_GRAYSCALE)
    img_test_list = [img_test_1, img_test_2, img_test_3, img_test_4]
    scaled = Preprocessor.scale_images(self, img_test_list)
    self.assertEqual(len(scaled), 4)
    self.assertEqual(np.shape(scaled[0]), (512, 512))
    self.assertEqual(np.shape(scaled[1]), (512, 512))
    self.assertEqual(np.shape(scaled[2]), (512, 512))
    self.assertEqual(np.shape(scaled[3]), (512, 512))

def main(config):
    scanner = Scanner(config)
    preprocessor = Preprocessor(config)
    utils = Utils(config)
    scanner.scan(front=True)
    if config.manual_duplex:
        print('rotate pages and press enter')
        input()
        scanner.scan(front=False)
    pages = scanner.get_pages()
    preprocessor.process(pages)
    exporter = Exporter(config)
    exporter.save_as_pdf(pages)
    if utils.show_preview():
        exporter.upload_doc()
    utils.clean_up(pages)

def test_process_labels_consecutive_targets(self):
    input = cleandoc('''
        jump :label2 notEqual @unit null
        ubind @mono
        label1:
        label2:
        label3:
        end
    ''')
    expected = STANDARD_SIMPLE
    uut = Preprocessor(input)
    uut.process_labels()
    actual = uut.result
    self.assertEqual(expected, actual)

class Predictor:
    def __init__(self, args):
        self.args = args
        self.args.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.model_param_fname = os.path.join(
            "weights",
            f"best_model_{self.args.pretrained_model}_{self.args.cnn_mode}.pt")
        self.preprocessor = Preprocessor(self.args)

    def set_seed(self):
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)
        torch.manual_seed(self.args.seed)
        # torch.backends.cudnn.deterministic = True

    def get_vocab_info(self):
        self.vocab_size = len(self.TEXT.vocab)
        self.pad_index = self.TEXT.vocab.stoi['<pad>']
        self.unk_index = self.TEXT.vocab.stoi['<unk>']
        print(
            f"input dim: {self.vocab_size}\npad index: {self.pad_index}\nunk index: {self.unk_index}"
        )

    def run(self):
        self.set_seed()
        reviews, word2index = self.preprocessor.build_data()
        processed_csv_fname = os.path.join(self.args.data_path,
                                           self.args.processed_csv)
        self.TEXT, self.train_iterator, self.valid_iterator = data_load_without_cv(
            processed_csv_fname, self.args)
        self.get_vocab_info()
        word2vec_index, word2vec_vector = load_pretrained_word2vec(
            self.args.data_path, self.TEXT)
        self.TEXT.vocab.set_vectors(
            word2vec_index,
            torch.from_numpy(word2vec_vector).float().to(self.args.device),
            self.args.embedding_dim)
        self.model = PolarCNN(self.vocab_size, self.pad_index,
                              self.args).to(self.args.device)
        self.criterion = torch.nn.BCEWithLogitsLoss().to(self.args.device)

        # for prediction
        if os.path.isfile(self.model_param_fname):
            self.model.load_state_dict(torch.load(self.model_param_fname))
            probability, predicted_label = predict(self.TEXT, self.args,
                                                   self.model)
            print_predict_log(self.args, probability, predicted_label)
        else:
            print(
                f"model parameter file {self.model_param_fname} does not exist")
            print("for sentence prediction, please train your model first")

def test_process_labels_bad_target_error_line_number_between_other_targets(self):
    input = cleandoc('''
        jump :label1 notEqual @unit null
        label1:
        label2:
        jump :label2 notEqual @unit null
        ubind @mono
        label3:
        jump :label5 notEqual @unit null
        end
        jump :label4 notEqual @unit null
        label5:
        jump :label3 notEqual @unit null
    ''')
    uut = Preprocessor(input)
    with self.assertRaisesRegex(ValueError, r'Line 9'):
        uut.process_labels()

def preprocess_pipeline():
    """Read in raw lexicon and create preprocessed version."""
    for language, n in config.LANGUAGES_N:
        print("Preprocessing lexica for: {language}".format(language=language))
        if not os.path.exists("data/processed/{lan}".format(lan=language)):
            print("Creating directory: data/processed/{lan}".format(
                lan=language))
            os.mkdir("data/processed/{lan}".format(lan=language))
        if not os.path.exists(
                "data/processed/{lan}/reals".format(lan=language)):
            print("Creating directory: data/processed/{lan}/reals".format(
                lan=language))
            os.mkdir("data/processed/{lan}/reals".format(lan=language))
        config_dict = get_config_dict(config, language)
        ## Reset n according to language
        config_dict['n'] = n
        preprocessor = Preprocessor(**config_dict)
        info_for_generation = preprocessor.preprocess_lexicon()
        print("Now getting minimal pairs")
        preprocessor.get_minimal_pairs()

def test__tokenise_label_len_matches_tokens(self):
    # Arrange
    x = [
        "Comparison with", "alkaline phosphatases", "and", " ", " ",
        "5-nucleotidase"
    ]
    y = ["o", "s", "o", "o", "o", "s"]
    tokeniser = MagicMock()
    tokeniser.tokenize.side_effect = lambda z: list(
        filter(lambda y: y != "", z.split(" ")))
    label_mapper = MagicMock()
    label_mapper.entity_labels = ["s"]
    label_mapper.continuation_symbol = {"s": "sc"}
    label_mapper.other_label = "o"
    sut = Preprocessor(max_feature_len=5,
                       tokeniser=tokeniser,
                       label_mapper=label_mapper)
    sut._x = x
    sut._y = y

    # Act
    sut._tokenise()

    # Assert
    self.assertEqual(len(sut._x), len(sut._y))

def test__tokenise(self):
    # Arrange
    x = [
        "Comparison with", "alkaline phosphatases", "and", "5-nucleotidase"
    ]
    y = ["o", "s", "o", "s"]
    expected_x = [
        "Comparison", "with", "alkaline", "phosphatases", "and",
        "5-nucleotidase"
    ]
    expected_y = ["o", "o", "s", "sc", "o", "s"]
    tokeniser = MagicMock()
    tokeniser.tokenize.side_effect = lambda x: x.split(" ")
    label_mapper = MagicMock()
    label_mapper.entity_labels = ["s"]
    label_mapper.continuation_symbol = {"s": "sc"}
    label_mapper.other_label = "o"
    sut = Preprocessor(max_feature_len=5,
                       tokeniser=tokeniser,
                       label_mapper=label_mapper)
    sut._x = x
    sut._y = y

    # Act
    sut._tokenise()

    # Assert
    self.assertSequenceEqual(expected_x, sut._x)
    self.assertSequenceEqual(expected_y, sut._y)

def test__to_label_to_index(self):
    """Make sure label to index for y works"""
    # Arrange
    x = [
        "[CLS]", "Comparison", "with", "alkaline", "phosphatases", "and",
        "5-nucleotidase", "[PAD]"
    ]
    y = ["[PAD]", "o", "o", "s", "sc", "o", "s", "[PAD]"]
    labels = ["s", "sc", "o"]
    fake_labels = ["[PAD]"] + ["s", "sc", "o"]
    expected_y = [fake_labels.index(x) - 1 for x in y]
    tokeniser = MagicMock()
    tokeniser.tokenize.side_effect = lambda x: x.split(" ")
    label_mapper = MagicMock()
    label_mapper.entity_labels = ["s"]
    label_mapper.continuation_symbol = {"s": "sc"}
    label_mapper.other_label = "o"
    label_mapper.label_to_index = lambda x: labels.index(x)
    sut = Preprocessor(max_feature_len=5,
                       tokeniser=tokeniser,
                       label_mapper=label_mapper)
    sut._x = x
    sut._y = y

    # Act
    sut._to_label_index()

    # Assert
    self.assertSequenceEqual(expected_y, sut._y)

def main():
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')
    X = train[selected_fields]
    X[encoding_fields].astype('category', copy=False)
    y = train['SalePrice']
    preprocessor = Preprocessor()
    preprocessor.train(X)
    X = preprocessor.transform(X)
    y = np.array(y)
    clf = RandomForestRegressor()
    score = cross_val_score(clf, X, y, cv=5)
    print(score)
    X_test = test[selected_fields]
    X_test[encoding_fields].astype('category', copy=False)
    X_test = preprocessor.transform(X_test)
    clf.fit(X, y)
    y_pred = clf.predict(X_test)
    result = pd.DataFrame({'SalePrice': y_pred}, index=test['Id'])
    result.to_csv('output.csv')

def test_process_labels_target_two_labels(self):
    input = cleandoc('''
        label1:
        jump :label2 notEqual @unit null
        ubind @mono
        label2:
        jump :label1 notEqual @unit null
        end
    ''')
    expected = cleandoc('''
        jump 3 notEqual @unit null
        ubind @mono
        jump 1 notEqual @unit null
        end
    ''')
    uut = Preprocessor(input)
    uut.process_labels()
    actual = uut.result
    self.assertEqual(expected, actual)

def test_process_labels_empty_lines(self):
    input = cleandoc('''
        jump :label1 notEqual @unit null

        ubind @mono
        label1:
        end
    ''')
    expected = cleandoc('''
        jump 4 notEqual @unit null

        ubind @mono
        end
    ''')
    uut = Preprocessor(input)
    uut.process_labels()
    actual = uut.result
    self.assertEqual(expected, actual)

def _get_words(path):
    reader = EpubReader(path)
    book_text = reader.get_text()
    book_text = Preprocessor.process(book_text)
    words_to_occurrences = dict()
    for word in book_text:
        if word not in words_to_occurrences:
            words_to_occurrences[word] = Word(word)
        words_to_occurrences[word].add_occurrence()
    words_and_occurrences = list(words_to_occurrences.values())
    words_and_occurrences.sort(key=lambda elem: elem.occurrences, reverse=True)
    return words_and_occurrences

def test_crop_image(self):
    '''
    Loads an image from the test image library in grayscale and passes it to
    the crop_image function, which returns an array composed of 4 images of
    (2500, 2500). The test validates the dimensions of each piece and finally
    that the array is composed of 4 images.
    '''
    img_test = cv2.imread('test/img/test_color_gray.png', cv2.IMREAD_GRAYSCALE)
    cropped = Preprocessor.crop_image(self, img_test)
    self.assertEqual(np.shape(cropped[0]), (2500, 2500))
    self.assertEqual(np.shape(cropped[1]), (2500, 2500))
    self.assertEqual(np.shape(cropped[2]), (2500, 2500))
    self.assertEqual(np.shape(cropped[3]), (2500, 2500))
    self.assertEqual(len(cropped), 4)

def heldout_surprisal():
    """Calculate heldout surprisal"""
    for language, n in config.LANGUAGES_N:
        print("Calculating held-out surprisal for: {language}".format(
            language=language))

        ## Get config dict
        config_dict = get_config_dict(config, language)
        PHON_COLUMN = config_dict['phon_column']

        ## Load real lexicon
        LOAD_PATH = "data/processed/{lan1}/reals/{lan2}_with_mps_{n}phone.csv".format(
            lan1=language, lan2=language, n=n)
        SAVE_PATH = "data/processed/{lan1}/reals/{lan2}_with_mps_{n}phone_holdout.csv".format(
            lan1=language, lan2=language, n=n)
        df_lexicon = pd.read_csv(LOAD_PATH)
        print(len(df_lexicon))

        # Get heldout surprisal
        print("Calculating heldout surprisal...")
        NUM_FOLDS = 1000
        print("Number of folds: {x}".format(x=NUM_FOLDS))
        df_real_heldout = Preprocessor.calculate_heldout_surprisal(
            df_lexicon[PHON_COLUMN].values, n=n, num_folds=NUM_FOLDS)
        df_real_heldout[PHON_COLUMN] = df_real_heldout['word']
        df_real_heldout = df_real_heldout[[
            PHON_COLUMN, 'heldout_log_prob', 'heldout_surprisal'
        ]]
        print(len(df_real_heldout))

        # Merge with real processed lexicon
        df_merged = pd.merge(df_lexicon, df_real_heldout, on=PHON_COLUMN)
        print(len(df_merged))
        print("Saving to: {path}".format(path=SAVE_PATH))
        df_merged.to_csv(SAVE_PATH)

def test__call__no_label_runs_without_exceptions(self):
    x = [
        "Comparison with", "alkaline phosphatases", "and", "5-nucleotidase"
    ]
    labels = ["s", "sc", "o"]
    tokeniser = MagicMock()
    tokeniser.tokenize.side_effect = lambda x: x.split(" ")
    label_mapper = MagicMock()
    label_mapper.entity_labels = ["s"]
    label_mapper.continuation_symbol = {"s": "sc"}
    label_mapper.other_label = "o"
    label_mapper.label_to_index = lambda x: labels.index(x)
    sut = Preprocessor(max_feature_len=5,
                       tokeniser=tokeniser,
                       label_mapper=label_mapper)

    # Act
    x, y = sut(x)

    # Assert
    self.assertIsNone(y)

from sklearn.linear_model import Perceptron
from src.preprocessor import Preprocessor
from random import sample
from src.classifier import Classifier
import src.conf as conf
import numpy as np
import pickle
from sklearn.svm import SVC

preprocess = False
if preprocess:
    dataset_pickle = open(conf.project_path + 'data\dataset_cleared.pickle', 'rb')
    dataset = pickle.load(dataset_pickle)
    dataset_pickle.close()
    preprocessor = Preprocessor()
    dataset = list(map(lambda x: (preprocessor.preprocess(x[0]), x[1]), dataset))  # preprocess dataset
    preprocessed_dataset = open(conf.project_path + 'data\dataset_preprocessed.pickle', 'wb')
    pickle.dump(dataset, preprocessed_dataset)

dataset = pickle.load(open(conf.project_path + 'data\dataset_preprocessed.pickle', 'rb'))
dataset = [x for x in dataset if len(x[0].split()) > 0]
dataset = list(set(dataset))

classifiers = [(SVC(kernel='rbf', C=2.9, gamma=1), 'svm_rbf'),
               (SVC(kernel='linear'), 'svm_linear')]
               # (KNeighborsClassifier(), 'knn'),
               # (MultinomialNB(), 'naive_bayes'),

def preprocess_lexicon(language):
    """Preprocess lexicon."""
    config_dict = get_config_dict(config, language)
    preprocessor = Preprocessor(**config_dict)
    info_for_generation = preprocessor.preprocess_lexicon()
    return info_for_generation

import argparse
import sys

from src.preprocessor import Preprocessor


def parse_args():
    '''Parse and return command line arguments.'''
    parser = argparse.ArgumentParser(description='Mindustry Logic Preprocessor')
    parser.add_argument('--infile', '-f', nargs='?', type=argparse.FileType('r'),
                        help='Path to input file', default=sys.stdin)
    parser.add_argument('--outfile', '-o', nargs='?', type=argparse.FileType('w'),
                        help='Path to output file', default=sys.stdout)
    return parser.parse_args()


if __name__ == '__main__':
    cli = parse_args()
    with cli.infile as input:
        processor = Preprocessor(input.read())
    try:
        processor.process_labels()
    except Exception as exc:
        sys.stderr.write(str(exc))
    with cli.outfile as output:
        output.write(processor.get_result())

from src.preprocessor import Preprocessor

if __name__ == '__main__':
    preprocessor = Preprocessor()
    preprocessor.run(metadata_filename='./data/metadata.feather')
    preprocessor.save(tfidf_weight_filename='./data/tfidf.json',
                      idf_filename='./data/idf.json')

from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from src.preprocessor import Preprocessor
from random import sample
import numpy as np
import src.conf as conf
import pickle

test = open(conf.project_path + '/data/SemEval2016-task4-test.subtask-BD.txt', 'r').readlines()
gold = open(
    conf.project_path + '/data/SemEval2016_task4_subtaskB_test_gold.txt',
    'r').readlines()

p = Preprocessor()
preprocess = False
if preprocess:
    dataset = [p.preprocess(t.split('\t')[3].replace('\n', '')) for t in test]
    pickle.dump(
        dataset,
        open(conf.project_path + 'data/test_preprocessed.pickle', 'wb'))
else:
    dataset = pickle.load(
        open(conf.project_path + 'data/test_preprocessed.pickle', 'rb'))

labels = [t.split('\t')[2].replace('\n', '') for t in gold]
test_dataset = [(example, label) for example, label in zip(dataset, labels)]

def main():
    if len(sys.argv) < 2:
        Helper.print("Too few arguments given. Use -help to get help.")
        return
    if len(sys.argv) > 3:
        Helper.print("Too many arguments given. Use -help to get help.")
        return
    input_path_string = sys.argv[1]
    if input_path_string == "-help":
        Helper.print("Following arguments are required:")
        Helper.print("[0] absolute path to source folder")
        Helper.print("[1] absolute path to output folder")
        Helper.print("Example: \"C:\\temp\\data\\input\" \"C:\\temp\\data\\output\"")
        return
    output_path_string = sys.argv[2]
    Helper.blockPrint()

    # declare folder paths
    root_dir = os.path.abspath(os.sep)
    root = os.path.join(root_dir, "temp", "TabExImg")
    FileHelper.createPathIfNotExisting(root)
    input_path = input_path_string  # multiple scanned PDFs
    FileHelper.createPathIfNotExisting(input_path)
    pdf_images_path = os.path.join(root, "01_pdf_images")  # multiple scanned PDFs
    FileHelper.createPathIfNotExisting(pdf_images_path)
    preprocessed_images_path = os.path.join(root, "02_preprocessed_images")  # folder per pdf | preprocessed images
    FileHelper.createPathIfNotExisting(preprocessed_images_path)
    treated_pdfs_path = os.path.join(root, "03_treated_pdfs")
    FileHelper.createPathIfNotExisting(treated_pdfs_path)
    output_path = output_path_string
    FileHelper.createPathIfNotExisting(output_path)
    output_boundaries_path = os.path.join(output_path, "excel")
    FileHelper.createPathIfNotExisting(output_boundaries_path)
    output_pdf_path = os.path.join(output_path, "pdf")
    FileHelper.createPathIfNotExisting(output_pdf_path)

    # delete any leftover files from previous runs (01_pdf_images, 02_preprocessed_images, 03_treated_pdfs)
    Helper.print("Precautionary deletion of files from previous runs...")
    FileHelper.deleteAllFilesInFolder(pdf_images_path)
    FileHelper.deleteAllFilesInFolder(preprocessed_images_path)
    FileHelper.deleteAllFilesInFolder(treated_pdfs_path)

    # convert PDFs to images
    pdfConverter = ConvertPdf(input_path, pdf_images_path)
    pdfConverter.convertPdfs()

    # preprocess image files
    preprocessor = Preprocessor(input_path, pdf_images_path, preprocessed_images_path, treated_pdfs_path)
    preprocessor.execute()

    # move original PDFs to backup folder
    FileHelper.moveFiles(input_path, treated_pdfs_path)

    # detect table boundaries
    detection = Detection(preprocessed_images_path, output_path, output_boundaries_path, output_pdf_path)
    detection.detectTableBoundaries()

    # combine files
    Helper.print("Start Optical Character Recognition...")
    Helper.print("This can take some time...")
    ocrConverter = OcrConverter()
    ocrConverter.convertAllImagesToPdfs(pdf_images_path, output_pdf_path)
    ocrConverter.combinePdfs(output_pdf_path)
    Helper.print("Optical Character Recognition done...")

    Helper.print("Start cleanup of temporary files...")
    # delete old files (01_pdf_images, 02_preprocessed_images, 03_treated_pdfs)
    FileHelper.deleteAllFilesInFolder(pdf_images_path)
    FileHelper.deleteAllFilesInFolder(preprocessed_images_path)
    FileHelper.deleteAllFilesInFolder(treated_pdfs_path)
    Helper.print("Cleanup done...")
    Helper.print("Table Detection Done")
    Helper.print("Result Files in " + output_path)