def TestMultinomialNB(class_name_list, test_file_path, vocabulary_dict, prior_dict, cp_dict):
    positive = 0
    negative = 0
    for class_name in class_name_list:
        pos = 0
        neg = 0
        class_index = class_name_list.index(class_name)
        folder_path = join(test_file_path, class_name)
        doc_list = PreProcessing.getFileNames(folder_path)
        for doc in doc_list:
            filepath = join(folder_path, doc)
            filtered_word_list = PreProcessing.readData(filepath)
            index = applyMultinomialNB(vocabulary_dict, prior_dict, cp_dict,
                                       filtered_word_list)
            if index == class_index:
                pos += 1
            else:
                neg += 1
        print("Class: {0}, Total docs:{1} => Positive:{2}, Negative:{3}".format(
            class_name, len(doc_list), pos, neg))
        positive = positive + pos
        negative = negative + neg
    return (positive, negative)

def GenerateVocabularyData(stemming, vocabulary_path, train_data_path):
    with open(train_data_path) as tdf:
        train_data = json.load(tdf)
    vocabulary_dict = {}
    unigram_set = set()
    bigram_set = set()
    TotalPositiveWordCount = 0
    TotalNegativeWordCount = 0
    for key in train_data:
        for value in train_data.get(key):
            pp_value = pp.StopWordAndSpecialCharRemoval(value, stemming)
            tokens = re.split(r'\s+', pp_value)
            if stemming:
                bigrams = list(zip(pp.perform_stemming(tokens[:-1]),
                                   pp.perform_stemming(tokens[1:])))
                # Stem each token of the cleaned text (not each character of the string).
                unigrams = [ps.stem(word) for word in tokens]
            else:
                bigrams = list(zip(tokens[:-1], tokens[1:]))
                unigrams = list(tokens)
            temp_bigram_set = set(bigrams)
            bigram_set = bigram_set | temp_bigram_set  # append the new bigrams to the global set
            unigram_set.update(unigrams)
            if key == 'positive':
                TotalPositiveWordCount += len(set(unigrams)) + len(temp_bigram_set)
            else:
                TotalNegativeWordCount += len(set(unigrams)) + len(temp_bigram_set)
    vocabulary_dict.update({'unigram': list(unigram_set)})
    vocabulary_dict.update({'bigram': list(bigram_set)})
    if os.path.exists(vocabulary_path):
        os.remove(vocabulary_path)
    with open(vocabulary_path, 'w') as outfile:
        json.dump(vocabulary_dict, outfile, sort_keys=True, indent=4)
    return {
        'PositiveWordCount': TotalPositiveWordCount,
        'NegativeWordCount': TotalNegativeWordCount
    }

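# A tiny, self-contained illustration of the zip() bigram idiom used in
# GenerateVocabularyData above, applied to an already-cleaned sentence.
# The example sentence is made up; pp.StopWordAndSpecialCharRemoval and the
# stemming step are assumed to have run already.
toy_tokens = "this movie was really good".split()
toy_bigrams = list(zip(toy_tokens[:-1], toy_tokens[1:]))
print(toy_bigrams)
# [('this', 'movie'), ('movie', 'was'), ('was', 'really'), ('really', 'good')]
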
def build_handler():
    try:
        # setNumOfBins()
        global numOfIntervals
        toCheck = e2.get()
        if toCheck == "":
            showinfo(
                "Naive Bayes Classifier",
                "Please insert an integer for the Discretization bins attribute")
            return
        numOfIntervals = int(toCheck)
    except:
        showinfo("Naive Bayes Classifier",
                 "Discretization bins must be an integer")
        return
    if numOfIntervals < 2:
        showinfo("Naive Bayes Classifier",
                 "Discretization bins must be at least 2")
        return
    if os.stat(pathToStructure).st_size == 0:
        showinfo("Naive Bayes Classifier",
                 "The file Structure.txt is empty. Please load valid files")
        return
    structure_file = open(pathToStructure, "r")
    try:
        dfTrain = pd.read_csv(pathToTrain)
    except Exception as e:
        if e.__str__() == "No columns to parse from file":
            showinfo("Naive Bayes Classifier",
                     "The file train.csv is empty. Please load valid files")
        else:
            showinfo("Naive Bayes Classifier",
                     "The file train.csv has errors. Please load valid files")
        return  # without a valid train set there is nothing to build
    totalNumOfRecords_train = dfTrain.shape[0]  # number of records
    if numOfIntervals > totalNumOfRecords_train:
        showinfo(
            "Naive Bayes Classifier",
            "Discretization bins must not be greater than the number of train set records")
        return
    global dfTrainFinal
    dfTrainFinal = pp.preProcess(structure_file, dfTrain, numOfIntervals)
    structure_file = open(pathToStructure, "r")
    attribute_values_dict = pp.set_attribute_values_dict(structure_file)
    cl.prepareModel(dfTrainFinal, pathToStructure, numOfIntervals,
                    attribute_values_dict)
    classify_Button.config(state='normal')
    showinfo("Naive Bayes Classifier",
             "Building classifier using train-set is done!")

def documentFrequency(data, terms):
    df = []
    documents = [d.document for d in data]
    for term in terms:
        dfWeight = 0
        for document in documents:
            tokens = pre.split(pre.stemming(pre.filtering(pre.tokenization(document))))
            if term in tokens:
                dfWeight += 1
        df.append(dfWeight)
    return df

def rawTermWeighting(data, terms):
    rawWeight = []
    documents = [d.document for d in data]
    for document in documents:
        documentWeight = []
        tokens = pre.split(pre.stemming(pre.filtering(pre.tokenization(document))))
        for term in terms:
            documentWeight.append(tokens.count(term))
        rawWeight.append(documentWeight)
    return rawWeight

def preprocess(): word="code" person="wg_" # TODO: set picture to the same size regardless of its length # tList, xList, yList, zList = pre.readFileGenByAcc("./hotwords_rsc/"+person+word+".csv") tList, xList, yList, zList=pre.readFile("./S9one100five.tsv") tList, freqList, xList, yList, zList = pre.standardize(tList, xList, yList, zList, highpass=5 / 1000) length=len(tList) # fig,ax=plt.subplots(3,1) # ax[0].plot(freqList[0:round(length / 2)], xList[0:round(length / 2)], color='red') # ax[1].plot(freqList[0:round(length / 2)], yList[0:round(length / 2)], color='green') plt.plot(freqList[0:round(length/2)],np.abs(zList[0:round(length/2)]),color='blue') plt.show()
def extract(): word = "code" person = "ty_" prefix="F:/2020AccelEve/database/fixed_rate/S9/" filename="S9one" testFreq=100 testCase=1 postfix=".tsv" for i in range(4): testCase=1 for j in range(5): STR=prefix+filename+(str)(testFreq)+'_'+(str)(testCase)+postfix # TODO: set picture to the same size regardless of its length tList, xList, yList, zList=pre.readFile(STR) # tList, xList, yList, zList = pre.readFileGenByAcc("./hotwords_rsc/" + person + word + ".csv") # =============time domain================== t=[] t.append(getMean(zList)) t.append(getStdDev(zList)) t.append(getKurtosis(zList)) t.append(getSkewness(zList)) t.append(getAveDev(zList)) t.append(getRMS(zList)) t=np.array(t) # =================freq domain================== f=[] tList, freqList, xList, yList, zList = pre.standardize(tList, xList, yList, zList, highpass=10 / 1000) length=len(zList) zList=zList[0:round(length/2)] freqList=freqList[0:round(length/2)] f.append(getSpecStdDev(np.abs(zList),freqList)) f.append(getSpecCentroid(np.abs(zList),freqList)) f.append(getSpecSkewness(np.abs(zList),freqList)) f.append(getSpecKurt(np.abs(zList),freqList)) f.append(getSpecCrest(np.abs(zList),freqList)) f.append("Nothing") f=np.array(f) dic={'FreqDomain':f,'TimeDomain':t} DF=pd.DataFrame(data=dic) savedName=prefix+"FeatureVector/"+filename+(str)(testFreq)+'_'+(str)(testCase)+'.csv' DF.to_csv(path_or_buf=savedName) # DF.to_excel(excel_writer='./test.xlsx') # print(DF) testCase += 1 testFreq += 100
def GetDigits(img):
    img = PreProcessing.Binarization(img)
    cv2.imshow('', img)
    rows, cols = img.shape[:]
    cols_coordinates = []
    number = ''
    flag = True
    # Scan columns left to right; a fully white column marks a gap between digits.
    for j in range(0, cols):
        temp_sum = 0
        for i in range(0, rows):
            temp_sum += img[i][j]
        if temp_sum == 255 * rows and flag:
            cols_coordinates.append(j + 5)
            flag = False
        if temp_sum < 255 * rows:
            flag = True
    print(cols_coordinates)
    if len(cols_coordinates) > 0:
        x = cols_coordinates[0]
        for idx in range(1, len(cols_coordinates)):
            w = cols_coordinates[idx]
            crop_img = img[:, x:w]
            x = w
            crop_img = FindBoundary(crop_img)
            crop_img = cv2.resize(crop_img, (20, 20))
            # Build a 28x28 canvas and centre the inverted 20x20 digit in it.
            temp_crop_img = [[0 for _ in range(28)] for _ in range(28)]
            for i in range(20):
                for j in range(20):
                    if crop_img[i][j] > 127:
                        crop_img[i][j] = 255
                    else:
                        crop_img[i][j] = 0
                    if crop_img[i][j] == 0:
                        temp_crop_img[4 + i][4 + j] = 255
            cv2.imshow('Boundary', np.float32(temp_crop_img))
            cv2.waitKey(0)
            pred_digit = get_Prediction(temp_crop_img)
            count = 0
            digit = str(pred_digit[0])
            for i in range(20):
                for j in range(20):
                    if temp_crop_img[4 + i][4 + j] == 255:
                        count = count + 1
            if count >= 200:
                digit = "1"
            print(digit, count)
            # Appending each digit to form a number
            number = number + digit
    return number

def single_model(df, model_type, target_col, cont_feat, cat_feat, refit):
    '''
    Runs a grid search of a single type of model.

    Inputs:
        df: a Pandas dataframe
        model_type (str): the type of model to be run
        target_col (str): the name of the target column
        cont_feat (list): list of continuous features
        cat_feat (list): list of categorical features
        refit (str or False): how the best model should be refit
            For decision tree refit can be one

    Returns:
        best_model: model object of the best model
        dataframe of feature importances
    '''
    train, test = PreProcessing.tt_split(
        df[[target_col] + cont_feat + cat_feat], 30)
    normalize_cont = True
    if model_type == "RandomForest" or model_type == "DecisionTree":
        normalize_cont = False
    train_X, train_Y, test_X, test_Y, labels = pre_processing(
        target_col, train, test, cont_feat, cat_feat, normalize_cont)
    grid = build_model(train_X, train_Y, refit, model_type)
    best_model = eval_model(grid, test_X, test_Y, model_type)
    fixed_val_threshold(best_model, test_X, test_Y)
    feature_headers = list(labels)
    feature_headers.remove(target_col)
    return best_model, pd.DataFrame(
        index=feature_headers,
        data=best_model.feature_importances_).sort_values(by=0, ascending=False)

def try_four_models(df, target_col, cont_feat, cat_feat, refit):
    """Copied from log_model to call big_grid_search instead of build_log_model"""
    train, test = PreProcessing.tt_split(
        df[[target_col] + cont_feat + cat_feat], 30)
    train_X, train_Y, test_X, test_Y, labels = pre_processing(
        target_col, train, test, cont_feat, cat_feat)
    big_grid_search(train_X, train_Y, test_X, test_Y, refit=refit)

def pre_processing(self):
    if self.input_tests(self.entryClusterNumber.get(),
                        self.entryRunsNumber.get(), self.file_path):
        self.preProcessing = PreProcessing(self.df).clean()
        message_box.showinfo("K Means Clustering",
                             "Preprocessing completed successfully!")
        self.cluster_button.config(state=NORMAL)

def cal_tenengrad(img):
    if len(img.shape) == 3:
        (img, _, _) = cv.split(img)
    temp = pp.image_to_matrix(img)
    temp_sobel = filters.sobel(temp)
    source = np.sum(temp_sobel ** 2)
    metric = np.sqrt(source)
    return metric

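# A hedged, standalone sanity check of the Tenengrad idea in cal_tenengrad above:
# a sharp image should score higher than a blurred copy of itself. This sketch
# bypasses the pp.* helpers and works on a plain NumPy array; the random test
# image is made up for illustration only.
import numpy as np
from skimage import filters

def tenengrad_demo():
    rng = np.random.default_rng(0)
    sharp = rng.random((64, 64))                # high-frequency "sharp" test image
    blurred = filters.gaussian(sharp, sigma=3)  # low-pass filtered copy

    def tenengrad(a):
        return np.sqrt(np.sum(filters.sobel(a) ** 2))

    print(tenengrad(sharp) > tenengrad(blurred))  # expected: True
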
def binaryTermWeighting(data, terms):
    binaryWeight = []
    documents = [d.document for d in data]
    for document in documents:
        documentWeight = []
        tokens = pre.split(pre.stemming(pre.filtering(pre.tokenization(document))))
        for term in terms:
            if term in tokens:
                documentWeight.append(1)
            else:
                documentWeight.append(0)
        binaryWeight.append(documentWeight)
    return binaryWeight

def main():
    pre_processed_data, pre_processed_data_matrix = PreProcessing.pre_process(
        KGRAM_RANGE, TOKEN_TYPE)
    processed_data = Mining.KGramClusteringExperiment(
        pre_processed_data, pre_processed_data_matrix)
    return

def clusters_in_two_dim_no_url():
    data, dataMatrix, features = PreProcessing.pre_process_content_only_no_url(
        (2, 2), file_name='IRAhandle_tweets_all.csv')
    print('PreProcessing Done')
    # Mining.SimpleKGram(data, dataMatrix, 4)
    data = Mining.project_to_two_dimensions(data, dataMatrix)
    print('Mining Done')
    PostAnalysis.plot_2D(data, '2dPlotSimpleClusteringCR22.png')
    data.to_csv('simpleClusteringK20WithCords.csv')

def generateWordCount(self, train_data, stemming):
    word_and_word_count = {}
    document = ''
    for td in train_data:
        document = document + ' ' + td
    document = pp.StopWordAndSpecialCharRemoval(document, stemming)
    word_array = numpy.array(document.split())
    unique, counts = numpy.unique(word_array, return_counts=True)
    return dict(zip(unique, counts))

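# Quick, self-contained illustration of the numpy.unique counting idiom used in
# generateWordCount above, on a toy word list. The sentence is made up and the
# pp.* clean-up step is assumed to have run already.
import numpy

toy_words = numpy.array("the cat sat on the mat the".split())
toy_unique, toy_counts = numpy.unique(toy_words, return_counts=True)
print(dict(zip(toy_unique, toy_counts)))
# counts "the" three times and every other word once
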
def GenSet(times, iteration):
    print("Fold: " + str(times))
    coal_prefix = 'D:\\coal-gangue\\selected\\coal\\'
    gangue_prefix = 'D:\\coal-gangue\\selected\\gangue\\'
    coal_num = 184
    gangue_num = 182
    suffix = '.jpg'
    train = []
    label = []
    testset = []
    for i in range(coal_num):
        path = coal_prefix + str(i) + suffix
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        img = pp.prep(img)
        # Area-relation resampling is used here for interpolation
        # img = cv2.resize(img, (150, 150), interpolation=cv2.INTER_AREA)
        if i % iteration != times:
            train.append(fe.Rotation_invariant_LBP(img))
            label.append("coal")
        else:
            t0 = time.time()
            tmp = [i, fe.Rotation_invariant_LBP(img), "coal"]
            t1 = time.time()
            tmp.append(t1 - t0)
            testset.append(tmp)
    for i in range(gangue_num):
        path = gangue_prefix + str(i) + suffix
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        img = pp.prep(img)
        # Area-relation resampling is used here for interpolation
        # img = cv2.resize(img, (150, 150), interpolation=cv2.INTER_AREA)
        if i % iteration != times:
            train.append(fe.Rotation_invariant_LBP(img))
            label.append("gangue")
        else:
            t0 = time.time()
            tmp = [i, fe.Rotation_invariant_LBP(img), "gangue"]
            t1 = time.time()
            tmp.append(t1 - t0)
            testset.append(tmp)
    print("Fold " + str(times) + " done!")
    return np.asarray(train), np.asarray(label), testset

def pipeline(img, isVideo=False):
    # Image Preprocessing
    undst, binary, binary_warped = PreProcessing.preprocess_image(img)

    # Lane Detection Code Start
    lanes, leftx, lefty, rightx, righty, ploty = LaneFinding.get_lane_lines(
        binary_warped, isVideo)
    lcurve, rcurve = Support.get_real_lanes_curvature(ploty, leftx, lefty,
                                                      rightx, righty)
    output = draw_lane_area(undst, binary_warped, ploty, leftx, lefty, rightx,
                            righty, isVideo)
    left_fit, right_fit, dummy = Support.fit_polylines(binary_warped.shape[0],
                                                       leftx, lefty, rightx,
                                                       righty,
                                                       x_scale_factor=1,
                                                       y_scale_factor=1)
    left_fitx, right_fitx = Support.get_polylines_points(ploty, left_fit,
                                                         right_fit)
    if isVideo is True:
        lcurve, rcurve = getSmoothedCurveData(lcurve, rcurve)
        left_fitx, right_fitx = getSmoothedLanesData(left_fitx, right_fitx)
    shiftFromLaneCenter_m, side = calculate_shift_from_lane_center(
        binary_warped, left_fitx, right_fitx)
    Font = cv2.FONT_HERSHEY_SIMPLEX
    color = (255, 255, 255)
    cv2.putText(output, 'curve = ' + str((lcurve + rcurve) / 2) + ' m',
                (10, 100), Font, 1, color, 2, cv2.LINE_AA)
    cv2.putText(
        output, 'Vehicle is ' + str(shiftFromLaneCenter_m) + ' (m) ' + side +
        ' of lane center', (10, 150), Font, 1, color, 2, cv2.LINE_AA)
    # Lane Detection Code End

    # Vehicle Detection Code Start
    cars_boxs = get_classified_cars_boxs(undst)
    classified_boxs = Visualisation.draw_boxes(undst, cars_boxs,
                                               color=(0, 0, 255), thick=6)
    filtered_boxs, heat_map = get_heat_map_boxs(cars_boxs, undst, isVideo)
    output = Visualisation.draw_boxes(output, filtered_boxs,
                                      color=(0, 0, 255), thick=6)
    # Vehicle Detection Code End
    return undst, classified_boxs, heat_map, output

def cutImage(image_bin, nPixel, space, verticalCut: bool = False):
    '''
    Evaluates where the image can be cut.

    Parameters
    ----------
    image_bin : array of binarized image pixels
    nPixel : integer threshold of black pixels below which it is possible to make a cut
    space : integer threshold of consecutive white pixels not to be cut
    verticalCut : boolean value to rotate the image to evaluate vertical cuts
        (default = False) (optional)

    Returns
    -------
    info : array of triplets [nWhitePixel, pStart, pEnd], where nWhitePixel is the
        number of consecutive white pixels, pStart is the first white pixel and
        pEnd is the last white pixel
    '''
    if verticalCut:
        image_bin = image_bin.T
    # Counting black pixels per row (axis=0: col, axis=1: row)
    counts, _ = pp.projection(image_bin)
    # cut contains all lines that have fewer than nPixel black pixels
    cut = []
    for i in range(counts.shape[0]):
        if counts[i] < nPixel:
            cut.append(i)
    x = 0
    h = 0
    info = []
    flag = False
    for j in range(len(cut) - 1):
        if cut[j + 1] - cut[j] == 1:
            if flag == False:
                h = cut[j]
                flag = True
            x = x + 1
        else:
            info.append([x, h, cut[j]])
            flag = False
            x = 0
    info.append([x, h, cut[j]])
    delete = []
    for k in range(len(info)):
        if info[k][0] < space:
            delete.append(k)
    for m in range(len(delete) - 1, -1, -1):
        info.remove(info[delete[m]])
    # print('[[nWhitePixel, pStart, pEnd]]:', info)
    if verticalCut:
        image_bin = image_bin.T
        ut.cv.imwrite('verticalCut.tif', image_bin)
    else:
        ut.cv.imwrite('horizontalCut.tif', image_bin)
    return info

def logTermWeighting(data, terms):
    logWeight = []
    documents = [d.document for d in data]
    for document in documents:
        documentWeight = []
        tokens = pre.split(pre.stemming(pre.filtering(pre.tokenization(document))))
        for term in terms:
            count = tokens.count(term)
            if count > 0:
                documentWeight.append(1 + math.log10(count))
            else:
                documentWeight.append(0)
        logWeight.append(documentWeight)
    return logWeight

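# A minimal worked example of the 1 + log10(tf) weighting used in logTermWeighting
# above, applied to documents that are already tokenized (the pre.* pipeline is
# assumed to have run); the two toy documents and the term list are made up.
import math

toy_docs = [["good", "movie", "good"], ["bad", "plot"]]
toy_terms = ["good", "bad"]
toy_log_weights = [
    [1 + math.log10(d.count(t)) if d.count(t) > 0 else 0 for t in toy_terms]
    for d in toy_docs
]
print(toy_log_weights)  # [[~1.301, 0], [0, 1.0]] -- "good" appears twice in doc 1
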
def single_model(df, model_type, target_col, cont_feat, cat_feat, refit):
    """For decision tree refit can be one"""
    train, test = PreProcessing.tt_split(
        df[[target_col] + cont_feat + cat_feat], 30)
    train_X, train_Y, test_X, test_Y, labels = pre_processing(
        target_col, train, test, cont_feat, cat_feat)
    grid = build_model(train_X, train_Y, refit, model_type)
    best_model = eval_model(grid, test_X, test_Y, model_type)
    fixed_val_threshold(best_model, test_X, test_Y)
    return best_model, labels

def training_model():
    x_train, x_val, x_test, y_train, y_val, y_test = PP.PreProcess(path_smiles)
    # Keep the trained network in `model` (assumed to be defined at module level);
    # Keras' fit() returns a History object, so it must not overwrite the model itself.
    history = model.fit(x_train, y_train, epochs=5, batch_size=64, verbose=1,
                        validation_data=(x_val, y_val))
    model.save('back_end/Model/servier.h5')
    return

def processTrainingData(class_name_list, train_file_path):
    word_counts_by_class = {}
    doc_dict = {}
    count = 0
    for class_name in class_name_list:
        word_count_dict = {}
        count += 1
        folder_path = join(train_file_path, class_name)
        filenames = PreProcessing.getFileNames(folder_path)
        for f in filenames:
            filepath = join(folder_path, f)
            filtered_word_list = PreProcessing.readData(filepath)
            word_count_dict = addToDictionary(filtered_word_list, word_count_dict)
        word_counts_by_class[count] = word_count_dict
        doc_dict[count] = len(filenames)
        print("Class: {0}, Total docs:{1}".format(class_name, doc_dict[count]))
    return (word_counts_by_class, doc_dict)

def cal_smd2(img):
    if len(img.shape) == 3:
        (img, _, _) = cv.split(img)
    metric = 0.0
    x, y = img.shape
    img = pp.image_to_matrix(img)
    for i in range(x - 1):
        for j in range(y - 1):
            # SMD2: product of the horizontal and vertical gradient magnitudes
            # (the original multiplied the same horizontal difference by itself).
            metric += abs(int(img[i + 1, j]) - int(img[i, j])) * \
                      abs(int(img[i, j + 1]) - int(img[i, j]))
    return metric / 1000

def cal_vollath(img):
    if len(img.shape) == 3:
        (img, _, _) = cv.split(img)
    img = pp.image_to_matrix(img)
    x, y = img.shape
    source = 0
    for i in range(x - 1):
        for j in range(y):
            source += img[i, j] * img[i + 1, j]
    # Vollath's autocorrelation subtracts M*N times the squared mean intensity.
    metric = source - x * y * np.mean(img) ** 2
    return metric

def cal_energy_gradient(img):
    if len(img.shape) == 3:
        (img, _, _) = cv.split(img)
    metric = 0.0
    x, y = img.shape
    img = pp.image_to_matrix(img)
    for i in range(x - 1):
        for j in range(y - 1):
            # Energy gradient: squared horizontal plus squared vertical difference
            # (the original squared the same horizontal difference twice).
            metric += (int(img[i + 1, j]) - int(img[i, j])) ** 2 + \
                      (int(img[i, j + 1]) - int(img[i, j])) ** 2
    return metric / 100000

def remove(img):
    rows, cols = img.shape[:2]
    BinarizedImage = PreProcessing.Binarization(img)
    # Move the top crop line past rows that are mostly black.
    upper_height = 0
    for i in range(rows):
        temp_sum = 0
        for j in range(cols):
            temp_sum += BinarizedImage[i, j]
        if temp_sum < int(0.15 * 255 * cols):
            upper_height = i + 5
        else:
            break
    # Same scan from the bottom.
    lower_height = 0
    for i in range(rows):
        temp_sum = 0
        for j in range(cols):
            temp_sum += BinarizedImage[rows - 1 - i, j]
        if temp_sum < int(0.15 * 255 * cols):
            lower_height = i + 5
        else:
            break
    # Same scan from the left.
    left_width = 0
    for j in range(cols):
        temp_sum = 0
        for i in range(rows):
            temp_sum += BinarizedImage[i, j]
        if temp_sum < int(0.03 * 255 * rows):
            left_width = j + 5
        else:
            break
    # Same scan from the right.
    right_width = 0
    for j in range(cols):
        temp_sum = 0
        for i in range(rows):
            temp_sum += BinarizedImage[i, cols - 1 - j]
        if temp_sum < int(0.03 * 255 * rows):
            right_width = j + 5
        else:
            break
    x = left_width
    y = upper_height
    h = rows - 1 - lower_height
    w = cols - 1 - right_width
    return img[y:h, x:w]

def applyPreProc():
    """
    Desc : Apply Preprocessing
    """
    print('\n ********* Preprocessing **********')
    # fileName='/nobackup/anikgaik/search/features/Train_Features/Train_Features_Modified.csv'
    # writeFile='/nobackup/anikgaik/search/features/Train_Features/Train_Features_Replacing_Missing.csv'
    fileName = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Selected_Features.csv'
    writeFile1 = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Train_Features_Modified.csv'
    writeFile2 = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Final_Train_Features.csv'
    oPP = PreProc.preprocessing()
    oPP.removeFeature(fileName, writeFile1)
    oPP.featureDiscretization(writeFile1, writeFile2)

def log_model(df, target_col, cont_feat, cat_feat, refit):
    '''
    wrapper function for all of it
    '''
    # cont_feat.extend(bin_names)  # bin_names comes from data prep
    train, test = PreProcessing.tt_split(
        df[[target_col] + cont_feat + cat_feat], 30)
    pare_df(df, cont_feat, cat_feat, target_col)
    train_X, train_Y, test_X, test_Y = pre_processing(target_col, train, test,
                                                      cont_feat, cat_feat)
    grid = build_log_model(train_X, train_Y, refit=refit)
    best_log = eval_log_model(grid, test_X, test_Y)
    fixed_val_threshold(best_log, test_X, test_Y)
    return None

def main():
    global tList, xList, yList, zList
    tList, xList, yList, zList = pre.readFile('siri_digits/siri_one_up_bass1.tsv')
    tList, freqList, xList, yList, zList = \
        pre.standardize(tList, xList, yList, zList, highpass=85 / 1000)
    xList, yList, zList = pre.reverseFFT(xList, yList, zList)
    # pre.showMap(tList, xList, yList, zList, '85hz filter')
    # zSmooth, tSmooth = seg.smooth(zList, tList)
    cuttingpoints = [1196, 2021, 3909, 4711, 6614, 7425, 9275, 10085, 11947,
                     12760, 14604, 15440]
    cnt = 0
    length = len(cuttingpoints)
    # Each consecutive pair of cutting points delimits one segment of the signal.
    for i in range(0, length, 2):
        x_one = xList[cuttingpoints[i]:cuttingpoints[i + 1]]
        y_one = yList[cuttingpoints[i]:cuttingpoints[i + 1]]
        z_one = zList[cuttingpoints[i]:cuttingpoints[i + 1]]
        specX, specY, specZ = generateMap(x_one, y_one, z_one)
        generateRGB(specX, specY, specZ,
                    'siri_digits/siribassRGB' + str(cnt) + 'two.png')
        cnt += 1

def tokenizeToken(word):
    result = []
    # do some preprocessing (tries to filter out variable names and similar noise)
    preProcessing = PreProcessing.preProcess(word)
    # For every term that survived preprocessing, first try to split on possible separators,
    # then split on UC/LC boundaries (camel case); if that doesn't produce known words,
    # the terms could be split further with the greedy algorithm (currently disabled below).
    for t1 in preProcessing:
        firstStep = splitOnSeparators(t1)
        for t2 in firstStep:
            result += splitOnUCLC(t2)
            # for t3 in secondStep:
            #     result += refineUnknown(t3)
    return result

def argumentSetup():
    # build the command-line interface and return the parser
    parser = argparse.ArgumentParser()
    parser.add_argument('inputFile',
                        help="The input text file, usually a normal text file encoded in utf-8")
    parser.add_argument('outputFile',
                        help="The parsed output CoNLL file encoded in utf-8.")
    parser.add_argument('--model',
                        help="Indicate the model to be used if it is not the default model")
    parser.add_argument('--tagged', action="store_true",
                        help="Indicate to the parser that your data is already in CoNLL format in parsing mode.")
    parser.add_argument('turboOpt', nargs="*",
                        help="Additional options to pass to TurboParser (Without the preceding hyphens: '--evaluate' becomes 'evaluate')")
    return parser


if __name__ == "__main__":
    parser = argumentSetup().parse_args()
    taggedFile = parser.inputFile
    # parsing sequence
    if not parser.tagged:
        taggedFile = os.path.join(TEMP, "GSW_tagged" + os.path.basename(parser.inputFile))
        PreProcessing.main(parser.inputFile, taggedFile)
    # assign features
    FeatureConfig().run(taggedFile)
    model = DEFAULT_MODEL
    if parser.model:
        model = parser.model
    # call Turbo Parser
    args = ["--test",
            "--file_model={}".format(model),
            "--file_test={}".format(taggedFile),
            "--file_prediction={}".format(parser.outputFile)] + \
           ["--" + x for x in parser.turboOpt]
    print("Called TurboParser with options: " + " ".join(args))
    subprocess.call([TURBOP] + args)

'''
Data sets returned by PreProcessing.process():

Training:
    UP_TRAIN_RAW
    DOWN_TRAIN_RAW
    UP_TRAIN_SENTENCES
    DOWN_TRAIN_SENTENCES
    UP_TRAIN_TOKENIZED
    DOWN_TRAIN_TOKENIZED

Validation:
    UP_VALIDATE_RAW
    DOWN_VALIDATE_RAW
    UP_VALIDATE_SENTENCES
    DOWN_VALIDATE_SENTENCES
    UP_VALIDATE_TOKENIZED
    DOWN_VALIDATE_TOKENIZED

Test:
    Email number is array index + 1
    TEST_RAW
    TEST_SENTENCES
    TEST_TOKENIZED
'''
UP_TRAIN_RAW, DOWN_TRAIN_RAW, UP_TRAIN_SENTENCES, DOWN_TRAIN_SENTENCES, \
    UP_TRAIN_TOKENIZED, DOWN_TRAIN_TOKENIZED, UP_VALIDATE_RAW, DOWN_VALIDATE_RAW, \
    UP_VALIDATE_SENTENCES, DOWN_VALIDATE_SENTENCES, UP_VALIDATE_TOKENIZED, \
    DOWN_VALIDATE_TOKENIZED, TEST_RAW, TEST_SENTENCES, TEST_TOKENIZED = PreProcessing.process()

UP_TRAIN_TOTAL_TOKENIZED = UP_TRAIN_TOKENIZED + UP_VALIDATE_TOKENIZED
DOWN_TRAIN_TOTAL_TOKENIZED = DOWN_TRAIN_TOKENIZED + DOWN_VALIDATE_TOKENIZED

VALIDATE_LABELS_UP = [1] * len(UP_VALIDATE_RAW)
VALIDATE_LABELS_DOWN = [0] * len(DOWN_VALIDATE_RAW)

print('\n\n========== Unsmoothed N-Grams==========\n\n')
print('-----===== UP_TRAIN =====-----\n\n')
upTrainUnigram, upTrainBigram, upTrainTrigram = NGram.getNGram(UP_TRAIN_TOKENIZED)
print('\n\n-----===== DOWN_TRAIN =====-----\n\n')
downTrainUnigram, downTrainBigram, downTrainTrigram = NGram.getNGram(DOWN_TRAIN_TOKENIZED)

def process(self, image):
    image = pr.gamma_correction(0.2, image)
    image = pr.dog_filter(image)
    image = pr.histogram_equalize(image)
    return image