def makeSplit(corpusElements, fraction=0.5):
    documentIds = corpusElements.documentsById.keys()
    sample = Split.getSample(len(documentIds), fraction)
    division = {}
    for i in range(len(documentIds)):
        division[documentIds[i]] = sample[i]
    return division
def generate_category_choice(possible):
    """Generates all distinct category splits.

    possible: Possible values for the category.

    If there are n categories, there are 2^(n-1)-1 distinct possible
    splits. All the splits are generated and returned in a list. Uses
    the binary form of the numbers from 1 to 2^(n-1)-1 to generate the
    splits."""
    n = len(possible)
    splits = []
    for i in range(1, pow(2, n-1)):
        split = Split(is_numerical=False)
        for j in xrange(n):
            if (i >> j) % 2 == 1:
                split.add_category_range(possible[j])
        splits.append(split)
    return splits
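# A worked example (illustrative comments only, not from the original source):
# with possible = ['a', 'b', 'c'] there are 2^(3-1)-1 = 3 splits, and i runs
# over 1..3:
#   i = 1 (binary 01) -> categories {'a'}       (vs. implicit complement {'b', 'c'})
#   i = 2 (binary 10) -> categories {'b'}       (vs. {'a', 'c'})
#   i = 3 (binary 11) -> categories {'a', 'b'}  (vs. {'c'})
# The highest bit is never set because i < 2^(n-1), which is what prevents
# generating both a split and its complement.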
def create_library(img, path_t, lib_dir, w, Ovr, f, aug):
    fn = get_name(img)

    ### Build subfolders
    if os.path.isdir(lib_dir) is False:
        print('\nCreating training library folders at: ' + lib_dir)
        os.makedirs(lib_dir)
        os.makedirs(lib_dir + '/pics')
        os.makedirs(lib_dir + '/masks')
    pics_dir = lib_dir + '/pics'
    masks_dir = lib_dir + '/masks'

    ### Split mosaic into tiles
    print('Splitting image: %s...' % fn)
    with suppress_stdout():  ### suppress the long output
        Split.split_image(input=img, output_dir=pics_dir, patch_w=w, patch_h=w,
                          adj_overlay_x=Ovr, adj_overlay_y=Ovr, out_format=f)
    os.remove('split_image_info.txt')

    truths = gpd.read_file(path_t)
    crs = truths.crs
    # print('\nCascading truths for analysis...')
    truths = gpd.GeoSeries(cascaded_union(truths['geometry']))
    truths = gpd.GeoDataFrame(geometry=truths, crs=crs)

    ### Remove bad tiles from library (usually edge tiles) & re-number
    Filter.remove(pics_dir, truths)
    ### Create ground truth masks for tiles; remove non-overlapping tiles
    Masks.create_masks(path_t, lib_dir, pics_dir, masks_dir, img)
    ### Augment tile-mask pairs to grow the library
    Augment.augment_images(lib_dir, aug)

    print('\nLibrary build successful\n\n')
    return
def generate_numerical_splits(records, index):
    """Generates and fills all the possible numerical splits with the
    records, with respect to their feature at the given index.
    Returns the list of possible splits."""
    possible = {}
    for r in records:
        possible[r.features[index]] = True
    # sort the distinct values so the thresholds are well defined and the
    # largest value is the one that gets no split of its own
    possible = sorted(possible.keys())
    splits = []
    for i in xrange(0, len(possible) - 1):
        s = Split(is_numerical=True)
        s.set_numerical_range(possible[i])
        s.place(records, index)
        splits.append(s)
    return splits
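# A worked example (illustrative, assuming the sorted-threshold reading above):
# if the records' feature values at `index` are [7, 2, 5, 2], then
# possible == [2, 5, 7] and two splits are generated, one per threshold 2 and 5.
# The largest value (7) gets no split of its own, since that split would put
# every record on the same side. The exact predicate (<= vs. <) depends on
# Split.set_numerical_range and Split.place, which are not shown here.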
def AddPoint(self, x):
    print self.name, "adding point", x
    if self.latestPoint is None:
        self.latestPoint = x
    else:
        # create a new split between the last click and this one
        # (getting the order right)
        if (x > self.latestPoint):
            self.splits.append(Split.Split(self.motion, self.latestPoint, x))
        else:
            self.splits.append(Split.Split(self.motion, x, self.latestPoint))
        # add the split to the total motion represented by the track.
        # If this is the first split, we need to replace the masked motion
        # with the current split, otherwise add the new split to the
        # previous ones
        #if self.motion == self.maskedMotion:
        #    self.motion = self.splits[-1].GetMotion()
        #else:
        #    self.motion = Piavca.MotionAdder(self.splits[-1].GetMotion(), self.motion)
        self.latestPoint = None
def LoadSplits(self):
    filename = "%s.txt" % self.GetName()
    print filename
    file = open(filename, "r")
    lines = file.readlines()
    for line in lines:
        print line
        splits = string.split(line, ",")
        print splits
        self.splits = []
        for split in splits:
            if (split == ""):
                continue
            split = string.split(split)
            if split == []:
                continue
            print split
            split = Split.Split(self.motion, float(split[0]), float(split[1]))
            self.splits.append(split)
    for s in self.splits:
        print s
def main(path):
    A = splitString('data/' + path + '/PersonA.txt', 's')
    A1 = splitString('data/' + path + '/PersonA.txt', 'd')
    B = splitString('data/' + path + '/PersonB.txt', 's')
    B1 = splitString('data/' + path + '/PersonB.txt', 's')
    final = A[0] + B[0] + A1[0] + B1[0]
    final = [int(x) for x in final]
    # now our array is ordered
    final.sort()
    file = open('data/' + path + '/Time.txt', 'r')
    for line in file.readlines():
        time = line.rstrip().split('¦')  # using rstrip to remove the trailing newline
        time = [0 if x == '' else x for x in time]
        time = [int(x) for x in time]
    ### doing something ############################
    time = splitTime(final, time)  # this returns time in conversation
    splittedA = splitText(A[1], final)  # returns the split conversation
    splittedB = splitText(B[1], final)  # returns the split conversation
    split_index = Split.splitter_function('data/' + path)
    index = splitInterface(splittedA, split_index[0])  # return the index where the interface changes
    return (time[0:index], split_index[1], time[index:len(time)],
            splittedA[0:index], splittedA[index:len(splittedA)],
            splittedB[0:index], splittedB[index:len(splittedA)])
import Split
import ColorNeural
import cPickle as pickle
import numpy as np  # added: np is used below for seeding, permutation and ndarray
from sklearn.preprocessing import MinMaxScaler

np.random.seed(488)

#def normalize(imgs, std_reg=1e-5):
#    return (imgs - imgs.mean(axis=0, keepdims=True)) / (imgs.std(axis=0, keepdims=True) + std_reg)

scaler = MinMaxScaler()
mypath = '/home/it-lab412/Desktop/combined'
onlyfiles = colorFilesArray
random_seq = np.random.permutation(72390)
onlyfiles = onlyfiles[random_seq]
files_splits = Split.split_seq(onlyfiles, 30)
y1 = colorYArray.astype(np.int32)
y1 = y1[random_seq]
y_splits = Split.split_seq(y1, 30)
net = ColorNeural.NN()
for i in range(30):
    X = np.ndarray(shape=(2413, 3, 256, 256), dtype='float32')
    print "Batch Number = ", i
    files = files_splits[i]
    Y = y_splits[i]
    for n in range(0, 2413):
h_length = 256
sr = 22050
h_duration = h_length / sr
start_time = 0
steps = 0
thresholds = []
threshold = 0.0
max_threshold = 1.51
while (threshold < max_threshold):
    thresholds.append(threshold)
    threshold += 0.01
units = [1, 2, 3, 5, 7, 10]
epsilon = np.finfo(float).eps
for split_id in range(len(units)):
    test_set, training_set = sp.split_units(split_id)
    #training_set.pop(training_set.index(10))
    print("Testing on unit " + str(test_set[0]).zfill(2))
    threshold_Fs = Parallel(n_jobs=-1)(delayed(gf.get_F)(training_set, th) for th in thresholds)
    # find best threshold (argmax) and print it to slurm
    best_threshold_id = np.argmax(threshold_Fs)
    best_threshold = thresholds[best_threshold_id]
    best_threshold_str = "{0:.2f}".format(best_threshold)
    best_F = np.max(threshold_Fs)
    best_F_str = "{0:.2f}".format(100 * best_F)
    print("Best F measure for training: " + best_F_str + "%")
    print("Best threshold for training: " + best_threshold_str)
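# Side note (equivalent formulation, modulo floating-point rounding in the
# accumulating loop above): thresholds = list(np.arange(0.0, 1.51, 0.01))
# also yields the 151 values 0.00, 0.01, ..., 1.50.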
def main():
    # --------------------- import data ---------------------
    glass = dp.GlassImport()
    num_inputs = glass.shape[1] - 2
    # split into training, validation & testing data
    train, validate, test = sp.split(glass, num_inputs, t=0.7, v=0.15)

    # --------------------- specify network architecture ---------------------
    num_neurons1 = 10  # layer 1
    num_neurons2 = 2   # layer 2
    alpha = 0.01       # learning rate
    epoch = 100        # number of iterations
    nlayer1 = nl.NeuronLayer(num_neurons1, num_inputs, nl.tansig, nl.j_tansig)      # instantiate layer 1
    nlayer2 = nl.NeuronLayer(num_neurons2, num_neurons1, nl.softmax, nl.j_softmax)  # instantiate layer 2

    np.random.seed(0)
    # randomly initialize weight and bias on the interval [-0.5, 0.5]
    W1 = np.matrix(np.random.rand(num_neurons1, num_inputs) - 0.5)
    b1 = np.matrix(np.random.rand(num_neurons1, 1) - 0.5)
    W2 = np.matrix(np.random.rand(num_neurons2, num_neurons1) - 0.5)
    b2 = np.matrix(np.random.rand(num_neurons2, 1) - 0.5)

    # initialize cross entropy loss (training & validation)
    ce_t = []
    ce_v = []
    global s1_all, s2_all

    # pass on randomly initialized weights and biases to the network
    nlayer1.setWeightBias(w=W1, b=b1)
    nlayer2.setWeightBias(w=W2, b=b2)

    # ------------------------------------------------------------------
    # training the network
    # ------------------------------------------------------------------
    for j in range(epoch):
        for i in range(len(train)):
            # create input matrix from training dataset
            input = np.matrix(train.iloc[i, slice(0, num_inputs)]).transpose()

            # --------------------- propagate the inputs forward ---------------------
            nlayer1.FP(input)
            nlayer2.FP(nlayer1.a)

            # --------------------- calculate errors ---------------------
            target = np.matrix(train.iloc[i, -2:])
            e = nl.cross_entropy(nlayer2.a, target)
            if i == 0:
                e_all = e
            else:
                e_all = np.concatenate((e_all, e), axis=1)

            # --------------------- backpropagate sensitivities ---------------------
            s2 = nl.senseo(t=target, a=nlayer2.f(nlayer2.n))  # layer 2 sensitivity
            if i == 0:
                s2_all = s2
            else:
                s2_all = np.concatenate((s2_all, s2), axis=1)
            s1 = nl.senseh(F_prime=nlayer1.j(nlayer1.a), W=W2, s=s2)  # layer 1 sensitivity
            if i == 0:
                s1_all = s1
            else:
                s1_all = np.concatenate((s1_all, s1), axis=1)

        # --------------------- cross-entropy loss ---------------------
        ce_t.append(e_all.mean())

        # --------------------- update weights and biases ---------------------
        nlayer2.update(sensitivity=s2_all.mean(axis=1), learning_rate=alpha)  # layer 2 update
        nlayer1.update(sensitivity=s1_all.mean(axis=1), learning_rate=alpha)  # layer 1 update

        # ------------------------------------------------------------------
        # validating the network
        # ------------------------------------------------------------------
        input = np.matrix(validate.iloc[:, slice(0, num_inputs)]).transpose()

        # --------------------- propagate inputs forward ---------------------
        nlayer1.FP(input)
        nlayer2.FP(nlayer1.a)
        target = np.matrix(validate.iloc[:, -2:])

        # --------------------- compute errors ---------------------
        for i in range(len(target)):
            e = nl.cross_entropy(nlayer2.a[:, i], target[i])
            if i == 0:
                e_all = e
            else:
                e_all = np.concatenate((e_all, e), axis=1)
        ce_v.append(e_all.mean())

        # --------------------- early stopping condition ---------------------
        if j == 0:
            val_fail = []
        elif ce_v[j] > ce_v[j - 1]:
            val_fail.append(1)
            if len(val_fail) == 5:
                print 'Validation error has increased for 5 consecutive epochs. Early stopping at epoch {}'.format(j)
                break
        else:
            val_fail = []

    # ------------------------------------------------------------------
    # test and evaluate the network
    # ------------------------------------------------------------------
    # --------------------- confusion matrix ---------------------
    actual = pd.Series(test.iloc[:, -2], name='Actual')  # actual values (targets)
    input = np.matrix(test.iloc[:, slice(0, num_inputs)]).transpose()  # network inputs, p
    nlayer1.FP(input)      # layer 1 net-input
    nlayer2.FP(nlayer1.a)  # layer 2 net-input
    predict = nl.classify(nlayer2.a)  # predicted values from network
    predict = np.array(predict).flatten()
    predict = pd.Series(predict, name='Predicted')
    confusion = pd.crosstab(actual, predict, margins=False)  # create confusion matrix
    confusion = confusion.astype(float)  # convert values to floats
    print confusion  # output confusion matrix to the console

    # --------------------- accuracy metrics ---------------------
    ERR = (confusion.loc[0, 1] + confusion.loc[1, 0]) / len(predict)
    ACC = 1 - ERR
    FPR = confusion.iloc[0, 1] / (confusion.iloc[0, 1] + confusion.iloc[0, 0])
    TPR = confusion.iloc[1, 1] / (confusion.iloc[1, 0] + confusion.iloc[1, 1])
    print 'Accuracy: %.2f' % ACC
    print 'Error: %.2f' % ERR
    print 'False Positive Rate: %.2f' % FPR
    print 'True Positive Rate: %.2f' % TPR

    # --------------------- plot confusion matrix ---------------------
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.matshow(confusion, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confusion.shape[0]):
        for j in range(confusion.shape[1]):
            ax.text(x=j, y=i, s=confusion.iloc[i, j].astype(int), va='center', ha='center')
    plt.xlabel('Predicted Class')
    plt.ylabel('True Class')
    plt.title('Confusion Matrix of Test Set Predictions')

    # --------------------- plot log(cross-entropy loss) ---------------------
    fig, ax = plt.subplots(ncols=1, nrows=1, figsize=[8, 8])
    ax.plot(np.arange(0, len(ce_t)), np.log(ce_t), label='Training', linewidth=2.0, color='blue')
    ax.plot(np.arange(0, len(ce_v)), np.log(ce_v), label='Validation', linewidth=2.0, color='green')
    ax.set_ylabel('Log Cross Entropy Loss')
    ax.set_xlabel('Epochs')
    ax.legend()
    plt.grid()
    plt.show()
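# Note (assumption; nl's implementation is not shown here): for a softmax output
# vector a and a one-hot target t, the per-sample loss computed by
# nl.cross_entropy above is typically
#     e = -sum_i t_i * log(a_i)
# and for softmax combined with cross-entropy the output-layer sensitivity
# returned by nl.senseo simplifies to a - t.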
def makeFolds(ids, folds=10):
    sample = Split.getFolds(len(ids), folds)
    division = {}
    for i in range(len(ids)):
        division[ids[i]] = sample[i]
    return division
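# A minimal sketch of the getFolds contract assumed above (this stand-in is
# hypothetical, not the real Split.getFolds): one fold label per index.
def _getFolds_sketch(n, folds):
    return [i % folds for i in range(n)]  # round-robin fold labels
# With ids ['d1', 'd2', 'd3'] and folds=2, makeFolds would then return
# {'d1': 0, 'd2': 1, 'd3': 0}.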
def getDocumentFolds(documentIds, folds):
    sample = Split.getFolds(len(documentIds), folds)
    division = {}
    for i in range(len(documentIds)):
        division[documentIds[i]] = sample[i]
    return division
def split(path):
    head, tail = Split(path)
    if len(head) > 1 and head[-1] == '/':
        head = head[:-1]
    return (head, tail)
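# Assuming Split(path) cuts at the final '/' and leaves the separator on head
# (e.g. Split('/usr/lib/tile.png') -> ('/usr/lib/', 'tile.png')), this wrapper
# strips that trailing '/' so the result matches os.path.split, while the
# len(head) > 1 guard keeps a bare root '/' intact.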
def makeDivision(ids, fraction=0.5, seed=0):
    sample = Split.getSample(len(ids), fraction, seed)
    division = {}
    for i in range(len(ids)):
        division[ids[i]] = sample[i]
    return division
import re
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
import Split

# _nsre is assumed to be the usual natural-sort pattern, e.g.:
_nsre = re.compile(r'(\d+)')

def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(_nsre, s)]

mypath = '/home/it-lab412/Desktop/All512'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
onlyfiles.sort(key=natural_sort_key)
onlyfiles = np.array(onlyfiles)
onlyfiles = onlyfiles[2:]
#X = np.ndarray(shape=(5018,1,512,512),dtype='float32') ### Shifted inside the loop
#random_seq = np.random.permutation(175630)
random_seq = seqArray
random_files = onlyfiles[random_seq]
random_splits = Split.split_seq(random_files, 65)
y = pd.read_csv('trainLabels.csv')
y = y['level']
y1 = y
y2 = y
for i in range(4):
    y1 = y1.append(y2)
y1 = y1.values
y1 = y1.astype(np.int32)
y1 = y1[random_seq]
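# Example of the natural sort (illustrative): a plain lexical sort puts
# 'img10.png' before 'img2.png', whereas
# sorted(['img10.png', 'img2.png'], key=natural_sort_key)
# returns ['img2.png', 'img10.png'] because the digit runs compare numerically.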
outputTrees = []
for i in range(options.folds):
    newRoot = ET.Element("corpus")
    for key in corpusElements.rootElement.attrib.keys():
        newRoot.attrib[key] = corpusElements.rootElement.attrib[key]
    outputTrees.append(newRoot)

print >> sys.stderr, "Reading document ids"
documentIds = []
for document in corpusElements.documents:
    docId = document.attrib["id"]
    assert(not docId in documentIds)
    documentIds.append(docId)

print >> sys.stderr, "Calculating document division"
sample = Split.getFolds(len(documentIds), options.folds)
division = {}
for i in range(len(documentIds)):
    division[documentIds[i]] = sample[i]

print >> sys.stderr, "Dividing documents"
for document in corpusElements.documents:
    docId = document.attrib["id"]
    outputTrees[division[docId]].append(document)

for i in range(options.folds):
    if options.output == None:
        filename = options.input + ".fold" + str(i)
    else:
        filename = os.path.join(options.output, os.path.basename(options.input) + ".fold" + str(i))
    print >> sys.stderr, "Writing file", filename
import sys
sys.path.insert(0, "../")
import aiml
import Split
import logging

logging.basicConfig()

# The Kernel object is the public interface to the AIML interpreter.
k = aiml.Kernel()

# Use the 'learn' method to load the contents of an AIML file into the Kernel.
k.learn("cn-startup.xml")

# Use the 'respond' method to compute the response to a user's input string.
# respond() returns the interpreter's response, which in this case we ignore.
k.respond("load aiml cn")

# Loop forever, reading user input from the command line and printing responses.
while True:
    text = raw_input("> ")
    text = Split.splitChinese(text)
    logging.info(text)
    print k.respond(text)
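# Assumption (Split's source is not shown here): Split.splitChinese presumably
# inserts spaces between Chinese characters so the AIML pattern matcher can
# tokenize them, e.g. turning u'你好吗' into u'你 好 吗'.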
def do_your_thang(img_dir, out_path, path_t, saved_model, w, Ovr, f, timeline):
    truths = gpd.read_file(path_t)
    crs = truths.crs
    # print('\nCascading truths for analysis...')
    truths = gpd.GeoSeries(cascaded_union(truths['geometry']))
    truths = gpd.GeoDataFrame(geometry=truths, crs=crs)

    total = get_number(img_dir, '*tif')
    count = 1
    for pic in glob.glob(img_dir + '/*.tif'):
        if timeline:
            print('########## \ Timeline Image: %s / %s \ ##########' % (count, total))
        fn = get_name(pic)
        out_dir = out_path + '/' + fn

        ### Build subfolders
        if os.path.isdir(out_dir) is False:
            os.makedirs(out_dir)
            os.makedirs(out_dir + '/tiles')
            os.makedirs(out_dir + '/predictions')
            os.makedirs(out_dir + '/map')
            os.makedirs(out_dir + '/metrics')
        tiles_dir = out_dir + '/tiles'
        pred_dir = out_dir + '/predictions'
        map_dir = out_dir + '/map'
        met_dir = out_dir + '/metrics'

        ### Split mosaic into tiles
        print('Splitting image: %s...' % fn)
        with suppress_stdout():  ### suppress the long output
            Split.split_image(input=pic, output_dir=tiles_dir, patch_w=w, patch_h=w,
                              adj_overlay_x=Ovr, adj_overlay_y=Ovr, out_format=f)
        os.remove('split_image_info.txt')

        ### Remove tiles that don't intersect ground truths & re-number
        Filter.remove(tiles_dir, truths, overlap_only=True)
        ### Convert to .JPEG
        Convert.to_jpg(tiles_dir)
        ### Create & save predictions
        UNet_Predict.deploy_model(saved_model, tiles_dir, pred_dir)
        ### Create map from prediction tiles
        if Map.build_map(tiles_dir, pred_dir, map_dir, fn, truths):
            ### calculate performance metrics (and save True Positives for timeline)
            Metrics.run_metrics(truths, map_dir, pic, fn, met_dir, timeline)
        count += 1

    ### Create top folder to consolidate prediction/timeline output
    top_folder = out_path + '/Maps'
    if os.path.isdir(top_folder) is False:
        os.makedirs(top_folder)
    ### Copy output maps to the top folder
    for file in glob.glob(out_path + '/**/map/*cascaded_map*'):
        shutil.copy(file, top_folder)
    return
def run_single_fold_train_test(df, phys_target, run_params, pre, curr_fold_num):
    """
    Train, predict, and calculate eval metrics for the model on a single data fold.
    This function receives the data, takes care of splitting it into folds,
    trains the model and returns the results for the fold (index) it ran on.
    :param df: dataframe or list of dataframes. all columns are those that will be used for training
    :param phys_target: series with the physical model of the target data (for eval purposes)
    :param run_params: instance of the TestInstanceParams class, holds relevant model configurations
    :param pre: instance of type Process - for data preprocessing
    :param curr_fold_num: the index of the relevant fold. has to be an int between
        (and including) 0 and run_params.k - 1.
    :return: a dictionary with the model, the predictions and ground truth for the test,
        validation and train datasets; for the validation and test sets it also holds the
        physical model predictions, and a dataframe summarizing the evaluation metrics for
        the fold, for the train, validation and test sets
    """
    fold_dict = {}
    fold_dict["fold_num"] = curr_fold_num
    train, val, test, phys_val, phys_test = Split.kfold_split_train_test(
        df, curr_fold_num, k=run_params.k, phys_target=phys_target)
    pre.fit(*get_feature_and_target_data(train, run_params.target_col, run_params.is_target_in_input))
    fold_dict["preprocess"] = pre
    X_train, y_train, dates_y_train = pre.transform(
        *get_feature_and_target_data(train, run_params.target_col, run_params.is_target_in_input))
    X_val, y_val, dates_y_val = pre.transform(
        *get_feature_and_target_data(val, run_params.target_col, run_params.is_target_in_input))
    X_test, y_test, dates_y_test = pre.transform(
        *get_feature_and_target_data(test, run_params.target_col, run_params.is_target_in_input))
    input_dim = X_train.shape[2]
    model_structure_args = {"look_back": run_params.train_steps,
                            "input_dimension": input_dim,
                            "build_config_description": run_params.desc_str + "_f{}".format(curr_fold_num)}
    fold_dict["train"] = {}
    fold_dict["val"] = {}
    fold_dict["test"] = {}
    fold_dict["train"]["dates"] = dates_y_train
    fold_dict["val"]["dates"] = dates_y_val
    fold_dict["test"]["dates"] = dates_y_test
    with tf.device("/cpu:0"):
        curr_model = run_params.model_class(**model_structure_args)
    with tf.device("/cpu:0"):
        # train model (and save it, if this was implemented in the model class)
        curr_model = curr_model.fit(X_train, y_train, val_data=(X_val, y_val), **run_params.model_args)
    fold_dict["model"] = curr_model
    fold_dict["test"]["pred"] = pre.inverse_scale_target(fold_dict["model"].predict(X_test))
    fold_dict["test"]["true"] = pre.inverse_scale_target(y_test.reshape(-1, 1))
    fold_dict["test"]["ww3"] = phys_test.iloc[run_params.train_steps + run_params.pred_forward:].values.reshape(-1, 1)
    fold_dict["val"]["pred"] = pre.inverse_scale_target(fold_dict["model"].predict(X_val))
    fold_dict["val"]["true"] = pre.inverse_scale_target(y_val.reshape(-1, 1))
    fold_dict["val"]["ww3"] = phys_val.iloc[run_params.train_steps + run_params.pred_forward:].values.reshape(-1, 1)
    fold_dict["train"]["pred"] = pre.inverse_scale_target(fold_dict["model"].predict(X_train))
    fold_dict["train"]["true"] = pre.inverse_scale_target(y_train.reshape(-1, 1))
    fold_dict["results_test"] = Eval.eval_pred_phys_const(fold_dict["test"], pre)
    fold_dict["results_val"] = Eval.eval_pred_phys_const(fold_dict["val"], pre)
    # for train we don't look at the ww3 model or the const guess; these metrics
    # are interesting only for checking overfit in training
    train_eval = Eval.eval_model(fold_dict["train"]["true"], fold_dict["train"]["pred"])
    fold_dict["results_train"] = pd.Series(train_eval, name="ML")
    return fold_dict
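# A minimal usage sketch (hypothetical names: df, phys and run_params come from
# the surrounding project, not from this snippet): run every fold and collect
# the per-fold result dictionaries.
# fold_results = [run_single_fold_train_test(df, phys, run_params, Process(), k)
#                 for k in range(run_params.k)]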
import Split as sp
import Parser as prs
import mathExpr as ME

text = '''
X + y =100*65489-fsdahfj+76
x = c+4u
while (c>c(
qb = fds+fgds
if (c>c)
qb = fds+fgds
Q = x+y+n+5*9*y*u/u+i
'''
lists = sp.Split(text)
Res = prs.Parse(lists)
for i in range(len(lists)):
    print(lists[i], Res[i])

# text = 'x+y+n+5*9*y*u/u+i'
# lis = sp.Split(text)
# print(lis)
# print(ME.math_expr(lis[0]))
import PyPDF2
import Split
from subprocess import call
import sys

if (len(sys.argv) < 2):
    print("Error\nFormat: \n\tpython main.py your-pdf-file")
else:
    filename = sys.argv[1]
    directory = "splitted/" + filename
    Split.split(directory, filename)
    pdfFileObj = open(filename, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    for i in range(pdfReader.numPages):
        splitted_file_name = directory + "/" + repr(i)
        call(["pdftotext", splitted_file_name + ".pdf"])
        # f = open(splitted_file_name + '.txt', 'r')
        # print("Page %s" % repr(i+1))
        # print(f.read())
        # print("====================")
def _split_bins(self):
    split_clf = Split(feature=self._temp, min_sample=self.min_sample, max_node_number=self._bins)
    split_clf.fit(self._df, self._label)
    self.bins = split_clf.bins
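# A minimal usage sketch (hypothetical argument values; the constructor keywords
# are taken from the call above): supervised binning of a single feature into at
# most max_node_number intervals, with the learned cut points exposed as .bins.
# binner = Split(feature='age', min_sample=50, max_node_number=5)
# binner.fit(df, label)
# edges = binner.bins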