def recognize_image(image_name):
    mr.remove_meme_faces(base_path, test_items_path, image_name)
    pp.pre_process(base_path, pre_process_path, image_name)
    ls.line_segment(base_path, test_items_path, pre_processed_path, image_name)
    ws.word_segment(base_path, line_segments_path, image_name)
    cs.character_segment(base_path, words_segments_path, image_name)
    predict_sentence = rc.predict_sentence(base_path, character_segment_path, image_name)
    result = sc.spell_correct(predict_sentence)
    return result
def __init__(self, train_path, test_path):
    # Preprocessed train and test
    self.processed_train = pre_process(train_path)
    self.processed_test = pre_process(test_path)
    # Sentence array without unknown
    self.train_array_no_unk = sentence_array(self.processed_train)
    # Sentence array without occ -3
    train, test = get_sentences(self.processed_train, self.processed_test)
    self.train_array = train
    self.test_array = test
def preprocessing(input):
    metabolites = int(input.readline())
    reactions = int(input.readline())
    print('Metabolites: {0}\nReactions: {1}\n'.format(metabolites, reactions))
    reversibles = [int(x) for x in input.readline().split()]
    stoichio = []
    for line in input:
        stoichio.append([float(x) for x in line.split()])
    pre_process(stoichio, reversibles)
def main():
    # Properties
    code = 62
    # Prepare patients into array
    patients = prepare.prepare_patients_()
    # Loop through patients
    for i in range(len(patients)):
        patient = patients[i]
        pre_process.pre_process(patients, patient, code)
def run(self, args_s):
    args_d = json.loads(args_s)
    iname = args_d['panid']
    self.socketIO.emit('update', {
        'id': iname,
        "phase": 1,
        'val': -1,
        'max': -1
    })
    self.socketIO.wait(seconds=1)
    print("{0} start pre".format(args_d['local_id']))
    pre_process.pre_process(
        namedtuple('Struct', args_d.keys())(*args_d.values()))
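# The namedtuple('Struct', ...) idiom above turns a plain dict into an object
# whose keys become attributes, which is the shape pre_process expects here.
# A small self-contained demo with hypothetical keys:
from collections import namedtuple

demo_args = {'input_path': './pano.jpg', 'flip': True}
demo_struct = namedtuple('Struct', demo_args.keys())(*demo_args.values())
print(demo_struct.input_path, demo_struct.flip)  # attribute access instead of dict lookups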
def pollution_change(pollution, source, year, option='Mean'):
    df1 = pre_process(pollution, source, year, option)
    df2 = pre_process(pollution, source, '2016', option)
    df1[source + ' ' + option] = df1[source + ' ' + option].astype(float)
    df2[source + ' ' + option] = df2[source + ' ' + option].astype(float)
    df = df1
    df[source + ' ' + option] = df2[source + ' ' + option] - df1[source + ' ' + option]
    df[source + ' ' + option] = df[source + ' ' + option].astype(str)
    df["text"] = df["state"] + '<br>' + \
        source + ' ' + option + ' ' + df[source + ' ' + option]
    #scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
    #[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
    data = [
        dict(
            type='choropleth',
            #colorscale=scl,
            autocolorscale=True,
            locations=df.index,
            z=df[source + ' ' + option].astype(float),
            locationmode='USA-states',
            text=df['text'],
            marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
            colorbar=dict(title=pollution.loc[0, source + ' Units'])
        )
    ]
    layout = dict(
        title=year + ' - 2016 US ' + source + ' level change by state<br>(Hover for details)',
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showlakes=True,
            lakecolor='rgb(255, 255, 255)'),
    )
    fig = dict(data=data, layout=layout)
    py.iplot(fig, filename='us-pollution-change-map')
    plotSuccessful = "Pollution change map plotted."
    return fig, plotSuccessful
def predict(train_on_gpu, net, test_review, vocab_to_int, sequence_length=200):
    # Prints out whether a given review is predicted to be positive or negative
    # in sentiment, using a trained model.
    # Parameters: net = a trained network, test_review = review made of normal
    # text and punctuation, sequence_length = the padded length of a review.
    # pre-process and tokenize the review
    words, reviews_split = pre_process(test_review)
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in words])
    # test sequence padding
    features = pad_features(test_ints, sequence_length)
    # convert to tensor
    feature_tensor = torch.from_numpy(features)
    net.eval()
    batch_size = feature_tensor.size(0)
    # initialize hidden state
    h = net.init_hidden(batch_size, train_on_gpu)
    if train_on_gpu:
        feature_tensor = feature_tensor.cuda()
    # get predicted output
    output, h = net(feature_tensor, h)
    # convert output probability to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    if pred.item() == 1:
        print("Positive review detected!")
    else:
        print("Negative review detected.")
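# Small self-contained illustration (made-up probability, PyTorch only) of the
# final step above: the network's sigmoid output is rounded to a 0/1 class label.
import torch

demo_output = torch.tensor([[0.83]])        # hypothetical output probability
demo_pred = torch.round(demo_output.squeeze())  # 0.83 -> 1.0 -> "positive"
print(int(demo_pred.item()))                # 1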
def prediction():
    if request.method == 'GET':
        argument_dict = {}
        dict_tmp = request.args.to_dict()
        for key in dict_tmp.keys():
            argument_dict = ast.literal_eval(key)
        order = argument_dict['orderTitle']
        desc = argument_dict['description']
        text_string = order + " " + desc
        pre_processed_str = pre_process(text_string)
        df_dict = predict_lstm.main(order, desc)
        l1 = list(df_dict.keys())
        l2 = list(df_dict.values())
        res = {}
        for key in l1:
            for value in l2:
                res[key] = value
                l2.remove(value)
                break
        result = []
        for key, value in res.items():
            t = {}
            t['Prediction'] = key
            t['Confidence'] = value
            result.append(t)
        return json.dumps(result)
def transforms(img_dir, img_name, pre=False):
    """Transforms the img.

    Args:
        img_dir (str): dir. of image.
        img_name (str): file name of image.
        pre (bool): toggle pre-process

    Returns:
        tensor: image after transforms.
    """
    if pre:
        cache_path = os.path.join(img_dir, 'cache', img_name + 'pre_1.png')
        if os.path.isfile(cache_path):
            data = Image.open(cache_path).convert('L')
        else:
            data = pre_process(os.path.join(img_dir, img_name), remove_curve=True)
            data.save(cache_path)
    else:
        data = Image.open(os.path.join(img_dir, img_name)).convert('L')
    transforms = T.Compose([
        T.Resize((128, 128)),
        T.ToTensor(),
    ])
    data = transforms(data)
    return data
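# A minimal, runnable illustration (synthetic blank image rather than a file on
# disk) of the Resize + ToTensor pipeline applied in transforms() above.
from PIL import Image
import torchvision.transforms as T

demo_img = Image.new('L', (200, 80), color=255)          # grayscale 200x80 dummy image
demo_pipeline = T.Compose([T.Resize((128, 128)), T.ToTensor()])
demo_tensor = demo_pipeline(demo_img)
print(demo_tensor.shape)  # torch.Size([1, 128, 128])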
def pollution_map(df, source, year, option='Mean'):
    # Pre-process the pollution data so that it can be plotted by plotly.
    df2 = pre_process(df, source, year, option)
    #scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
    #[0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]
    data = [
        dict(
            type='choropleth',
            #colorscale=scl,
            autocolorscale=True,
            locations=df2.index,
            z=df2[source + ' ' + option].astype(float),
            locationmode='USA-states',
            text=df2['text'],
            marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
            colorbar=dict(title=df.loc[0, source + ' Units']))
    ]
    layout = dict(
        title=year + ' US ' + source + ' level by state<br>(Hover for details)',
        geo=dict(scope='usa',
                 projection=dict(type='albers usa'),
                 showlakes=True,
                 lakecolor='rgb(255, 255, 255)'),
    )
    fig = dict(data=data, layout=layout)
    py.iplot(fig, filename='us-pollution-map')
    plotSuccessful = "Pollution map plotted."
    return fig, plotSuccessful
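# py.iplot above targets the legacy online plotting API (plotly.plotly /
# chart_studio). A hedged sketch of rendering the same dict-based figure locally
# instead, assuming the offline module of a standard plotly install:
from plotly.offline import plot
# plot(fig, filename='us-pollution-map.html')  # writes an HTML file and opens it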
def main():
    compilable = sys.argv[1]
    preProcessed = pre_process.pre_process(compilable)
    parsed = function_parse.function_parse(preProcessed)
    formatable = order_of_computation.order_span(parsed)
    result = var_parse.var_format(formatable)
    print(result)
def __init__(self, n, using_weighted_distance, weight, p):
    self.n = n
    self.words = {}
    self.points = list()
    self.pre_processor = pre_process.pre_process()
    self.weighted_distance = using_weighted_distance
    self.weight = weight
    self.p = p
def SFM(X, y):
    # Select from model by feature importance: similar to stepwise backward
    # selection, gradually discarding the least important features.
    X_train, X_test, y_train, y_test = pre_process(X, y)
    clf = SVM_recommend()
    m_range = [2000 - 50 * i for i in range(36, 40)]
    for m in m_range:
        # The number of selected features is determined only by max_features;
        # no threshold is set.
        selector = SelectFromModel(clf, threshold=-np.inf, max_features=m)
        X_ = selector.fit_transform(np.asarray(X), np.asarray(y))
        X_train, X_test, y_train, y_test = pre_process(X_, y, bReset=True)
        clf = SVM_recommend_run(B_SFM, X_train, X_test, y_train, y_test,
                                paras={'max-features': m})
def ocr_core(filename):
    """
    This function will handle the core OCR processing of images.
    """
    # pytesseract.pytesseract.tesseract_cmd = '/app/.apt/usr/bin/tesseract'
    # Read the image with OpenCV, pre-process it, and use pytesseract to detect
    # the string in the image.
    text = pytesseract.image_to_string(
        pre_process(cv2.imread(filename))
    )
    return text
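# pre_process itself is not shown in this snippet; a minimal sketch, assuming a
# typical OpenCV clean-up for OCR (grayscale + Otsu binarisation), of what such
# a step often looks like. This is an illustrative assumption, not the project's code.
import cv2

def pre_process_sketch(img):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary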
def evaluateScore(X, y):
    X_train, X_test, y_train, y_test = pre_process(X, y, bReset=True)
    clf = SVC(C=0.01, max_iter=2000, kernel='linear', probability=True)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_classes = set(y)
    y_pred = label_binarize(y_pred, range(1, len(y_classes) + 1))
    y_test = label_binarize(y_test, range(1, len(y_classes) + 1))
    auc = metrics.roc_auc_score(y_test, y_pred, average='micro')
    return auc
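# A minimal self-contained sketch (toy labels, not the project's data) of the
# micro-averaged multi-class AUC computation used above: binarize both the
# predictions and the ground truth, then score with roc_auc_score(average='micro').
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

demo_true = np.array([1, 2, 3, 2, 1, 3])
demo_pred = np.array([1, 2, 3, 3, 1, 2])
demo_classes = range(1, 4)
demo_auc = roc_auc_score(label_binarize(demo_true, classes=demo_classes),
                         label_binarize(demo_pred, classes=demo_classes),
                         average='micro')
print(round(demo_auc, 3))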
def UF(X, y):
    # Univariate selection: score the relevance of each feature and keep the
    # top k; this is forward selection.
    k_range = [50 * i for i in range(1, 4)]
    for k in k_range:
        selector = SelectKBest(chi2, k=k)
        X_ = selector.fit_transform(X, y)
        X_train, X_test, y_train, y_test = pre_process(X_, y, bReset=True)
        SVM_recommend_run(F_UF, X_train, X_test, y_train, y_test, paras={'k-best': k})
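# A self-contained illustration (synthetic data, not the project's features) of
# the SelectKBest/chi2 call used above; chi2 requires non-negative inputs, hence
# the abs().
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, chi2

X_demo, y_demo = make_classification(n_samples=100, n_features=20, random_state=0)
X_demo = np.abs(X_demo)
X_top5 = SelectKBest(chi2, k=5).fit_transform(X_demo, y_demo)
print(X_top5.shape)  # (100, 5)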
def SVM_base(X, y):
    # model_name is the dimensionality-reduction method used; X is the feature
    # data after dimensionality reduction.
    X_train, X_test, y_train, y_test = pre_process(X, y)
    for d in DECI_FUNCS:
        for k in KERNELS:
            for C in CS:
                SVM_recommend_run(COMPARE, X_train, X_test, y_train, y_test,
                                  paras={}, C=C, kernel=k,
                                  decision_function_shape=d)
def VT(X, y):
    # Select features by variance threshold; this is backward elimination.
    for var in [0.03 * i for i in range(1, 50)]:
        selector = VarianceThreshold(threshold=var)
        X_ = selector.fit_transform(X)
        # X_.shape[1] is the number of selected features
        X_train, X_test, y_train, y_test = pre_process(X_, y, bReset=True)  # FIXME: problematic
        SVM_recommend_run(B_VT, X_train, X_test, y_train, y_test,
                          paras={
                              'variance': var,
                              'feature-num': X_.shape[1]
                          })
def get_csv():
    # Pre-processing
    imgs, img_names = pre_process()
    canny_imgs = []
    # Hand-written Canny edge detection (kept for reference):
    # new_gray = guass_smooth(gray_img)
    # dx, dy, M, theta = gradient(new_gray)
    # nms = NMS(M, dx, dy)
    # dt = double_threhold(nms)
    # dt[dt == 1] = 255
    # plt.imshow(dt)
    # plt.show()
    # Canny edge detection with cv2's implementation
    for img in imgs:
        img = cv2.GaussianBlur(img, (3, 3), 0)
        canny = cv2.Canny(img, 50, 150)
        canny_imgs.append(canny.ravel())
    # print(canny_imgs[0])
    # Save labelled / unlabelled samples obtained via binarisation & Canny edge detection.
    # Label name, file name: 13128 unique samples, 2013 Chinese characters
    label_df = pd.read_excel(r'after_process\character.xlsx', sheet_name='Sheet1')
    # Image, file name: 13462 samples (including duplicates)
    img_df = pd.DataFrame({
        'file_name': img_names,
        'img': canny_imgs
    })
    # Drop duplicate file names from img_df: 13430 unique samples
    img_df = img_df.drop_duplicates('file_name', keep='first')
    # Print arrays in full: csv stores their str form, so a truncated repr
    # would write ellipses instead of the data.
    np.set_printoptions(threshold=sys.maxsize)
    # Merge on file name to attach the label to each image.
    # Image, file name, label: 13120 labelled unique samples
    label_img_df = pd.merge(label_df, img_df, on='file_name')
    label_img_df.to_csv(r'after_process\label_character.csv')
    # Take the remaining unlabelled samples from img_df.
    # Image, file name: 310 unlabelled unique samples
    name_list = label_img_df['file_name'].tolist()
    unlabel_img_df = img_df[~img_df['file_name'].isin(name_list)]
    unlabel_img_df.to_csv(r'after_process\unlabel_character.csv')
    return None
def process_image(image) -> dict:
    result = dict()
    result['timestamp'] = int(time.time())  # unix timestamp
    # Pre-process: get just the scoreboard portion of the screen
    image_scaled = pre_process.pre_process(image, (70, 38, 350, 62), RESIZE_FACTOR)
    # image_scaled.show()
    clock_text, total_seconds = get_clock(image_scaled)
    home_score, away_score = get_score(image_scaled)
    home_name = get_home_team_name(image_scaled)
    away_name = get_away_team_name(image_scaled)
    result['clock'] = clock_text
    result['gametime'] = total_seconds  # in seconds
    result['home_score'] = home_score
    result['away_score'] = away_score
    result['home_name'] = home_name
    result['away_name'] = away_name
    return result
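# pre_process.pre_process is not shown here; a minimal sketch, assuming it crops
# the (left, top, right, bottom) scoreboard box and scales it up with Pillow as
# the call above suggests. Hypothetical illustration only, not the project's code.
from PIL import Image

def crop_and_scale(image: Image.Image, box, factor):
    region = image.crop(box)          # (left, upper, right, lower)
    w, h = region.size
    return region.resize((w * factor, h * factor))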
def generateSummary(sp):
    try:
        page_id = sp.page_id
        page = ParentPage.query.get(page_id)
        format = page.format
        if format == "html":
            content = page.page_content
            print(content)
        elif format == "pdf":
            print(content)
        elif format == "png":
            print(content)
        else:
            None
        try:
            # Modify mycore_parentpage by adding data to the columns named
            # page_content, image_link, facts, title.
            page.page_content, page.image_link, page.facts, page.title = pre_process(
                html_content)
        except:
            pass
        article_text = page.page_content
        gensimOut = gensimSum(article_text)
        out = gensimOut
        if out == "":
            sp.status = 3
        else:
            preface = custom_summarize(out, 5, 200)
            out = generate_paragraphs(out)
            out = preface_output_merger(preface, out)
            sp.status = 2
            sp.data = out
    except Exception as e:
        sp.status = 3
    db.session.commit()
def evaluate(self, test_x, test_y):
    # Timekeeping
    print("Start Evaluating.")
    start_time = time.time()
    correct = 0
    for x, y in zip(test_x, test_y):
        max_group = ""
        max_p = 1
        x_words = pre_process(x)
        for candidate_group in self.posts.keys():
            # P(O|H) * P(H) for each candidate group
            p = math.log(self.p_group[candidate_group])
            for word in x_words:
                if word in self.vocabulary:
                    p += math.log(
                        self.p_word_given_group[candidate_group][word])
            if p > max_p or max_p == 1:
                max_p = p
                max_group = candidate_group
def train(self, train_x, train_y):
    """
    :param train_x: Words from each document to train on
    :param train_y: Class the document belongs to
    :return:
    """
    # Timekeeping
    print("Start Training.")
    start_time = time.time()
    # Connect data and labels together (x -> y)
    for x, y in zip(train_x, train_y):
        words = pre_process(x)
        for word in words:
            self.posts[y].append(word)
            self.vocabulary.add(word)
    # Calculate P(Hj) and P(Wk|Hj)
    for group in self.posts.keys():
        self.p_word_given_group[group] = {}
        docs_in_group = self.posts[group]
        self.p_group[group] = len(docs_in_group) / len(train_x)
        # Count number of words (counts start at 1 for Laplace smoothing)
        for word in self.vocabulary:
            self.p_word_given_group[group][word] = 1.0
        for word in self.posts[group]:
            if word in self.vocabulary:
                self.p_word_given_group[group][word] += 1.0
        for word in self.vocabulary:
            self.p_word_given_group[group][word] /= len(
                self.posts[group]) + len(self.vocabulary)
    # Timekeeping
    timed = int(time.time() - start_time)
    print("Training finished in ", timed, "seconds.")
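# Worked example (toy counts, assumed for illustration) of the Laplace-smoothed
# estimate computed in train() above:
# P(word | group) = (count + 1) / (len(posts[group]) + len(vocabulary)).
count = 3              # occurrences of the word in this group's posts
group_word_total = 40  # total words collected for the group
vocab_size = 10        # distinct words across all groups
p = (count + 1) / (group_word_total + vocab_size)
print(p)  # 0.08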
def main(path):
    args = utils.get_args()
    filename = path + "/Data/" + args.filename
    # read data
    xls_data = binarization.read_xls(filename)
    # drop columns
    xls_reduce_data = utils.reduce_data(xls_data)
    #scores_xls, columns_xls = ranking.get_ranking(xls_reduce_data)
    #create_json.ranking_json(scores_xls, columns_xls, path, "pre_process_ranking.json")
    process_data = pre_process.pre_process(xls_data, args)
    process_data.to_csv(path + "/Data/process_data.csv")
    create_json.create_json_categories(process_data, path)
    scores_ranking, columns_ranking = ranking.get_ranking(process_data)
    create_json.ranking_json(scores_ranking, columns_ranking, path, "after_process_ranking.json")
    columns = list(process_data.columns.values)
    print("columns", columns)
    start = time.time()
    binar_data = pd.DataFrame()
    for column in columns:
        print("process column: ", column)
        binar_column = binarization.process_column(process_data[str(column)], column, args)
        binar_data = pd.concat([binar_data, binar_column], axis=1)
        #binar_data[column] = binar_df
    end = time.time()
    print("binarization took", end - start)
    print(binar_data)
    binar_data.to_csv(path + "/Data/binar_data.csv")
            if feature not in good_features:
                selected_features = list(good_features) + [feature]
                Xts = np.column_stack([X[:, j] for j in selected_features])
                score = evaluateScore(Xts, y)
                scores.append((score, feature))
                print("Current AUC : ", np.mean(score))
        good_features.add(sorted(scores)[-1][1])
        score_history.append(sorted(scores)[-1])
        print("Current Features : ", sorted(list(good_features)))

    # Remove last added feature
    good_features.remove(score_history[-1][1])
    good_features = sorted(list(good_features))
    print("Selected Features : ", good_features)
    return good_features


def transform(X, y):
    good_features = selectionLoop(X, y)
    return X[:, good_features]


if __name__ == "__main__":
    os.chdir('..')
    X, y = load_data_small()
    print(X.shape)
    X_ = transform(X, y)
    X_train, X_test, y_train, y_test = pre_process(X_, y, bReset=True)
    SVM_recommend_run(AUC, X_train, X_test, y_train, y_test,
                      {'feature-num': X_.shape[1]})
from pre_process import pre_process, const_df
from data_prep import data_const
from frame_prep import get_frames
from ConvLSTM import ConvLSTM_input, model
from performance import performance_metrics
import os
import tensorflow as tf

if __name__ == '__main__':
    fs = 20
    frame_size = fs * 4
    hop_size = fs * 2
    epochs = 150
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    preprocessed_list = pre_process()
    df = const_df(preprocessed_list)
    scaled_X = data_const(df)
    X, y = get_frames(scaled_X, frame_size, hop_size)
    print(X.shape)
    X_train, X_test, y_train, y_test = ConvLSTM_input(X, y)
    history, model = model(X_train, X_test, y_train, y_test, epochs)
    performance_metrics(model, history, epochs, X_test, y_test)
objOpt, thetaOpt, lamOpt, muOpt = sub.solve_subproblem(y, xBar)  # (objOpt) upper bound
thetaBar.append(thetaOpt)
lamBar.append(lamOpt)
muBar.append(muOpt)
SUBD = np.amin([objOpt, SUBD])
print "THETA"
print thetaOpt
print "X"
print xBar
# Preprocess: deal with duplicate hyperplanes, and remove hyperplanes whose
# coefficients are all zero.
g_flag, replicated_marker, coefficients = pre.pre_process(
    xBar, thetaOpt, lamOpt, muOpt, y)
# Print the flag and duplicate markers
#print "g_flag", g_flag
#print "replicated_marker", replicated_marker
#print len(coefficients)
#for co_index in xrange(len(coefficients)):
#    print coefficients[co_index]
# Get all the unique hyperplanes and save their coefficients.
linker, unique_coefficients = pre.unique_coeff(g_flag, replicated_marker,
                                               coefficients, M, K, N)
# Set a threshold used as the distance in the cell enumeration
distance = [np.spacing(1)]
#!/usr/bin/python
import sys

from IPython import embed

from connect_blop import download_data
from load_data import load_data
from pre_process import pre_process
from fit_predict_test import fit_predict_test

# ---- get data
key_string = str(sys.argv[1])
download_data(key_string)
df_dic = load_data()

# ---- pre process & feature building
data = pre_process(df_dic)

# ---- fit, predict and cross-validate
fit_predict_test(data)
def on_request(self, *args):
    global mutex1, mutex2, mutex, mutex_data
    # tf.reset_default_graph()
    print("got request")
    data = args[0]
    filename, ext = splitext(data['input_path'])
    panid = basename(filename)
    # download file from upper server
    print("download...")
    sshdownload(data)
    args_d = {}
    remote_uuid = "{0}{1}".format(uuid.uuid4(), "_deeplearning")
    socketIO = SocketIO('localhost', ssht2.local_bind_port, LoggingNamespace)
    args_d['remote_uuid'] = remote_uuid
    args_d['socketIO'] = socketIO
    args_d['model'] = "pspnet50_ade20k"
    args_d['sliding'] = True
    args_d['flip'] = True
    args_d['multi_scale'] = True
    print("phase 1...")
    args_d['input_path'] = "./{0}{1}".format(panid, ext)
    args_d['output_path'] = "{2}/{0}{1}".format(panid, ext, config_p1_folder)
    pre_process.pre_process(
        namedtuple('Struct', args_d.keys())(*args_d.values()))
    print("phase 2...")
    # args_d['sess'] = sess
    # args_d['model_ok'] = pspnet
    args_d['input_path'] = config_p1_folder + '/'
    args_d['input_path_filter'] = panid
    args_d['output_path'] = config_p2_folder + '/'
    del args_d['socketIO']
    mutex.acquire()
    with open("temp_arg.json", 'w+') as fout:
        fout.write(json.dumps(args_d))
    mutex.release()
    # mutex1.put(args_d, block=True)
    print("sent task, wait response")
    while (1):
        # print("waiting...")
        mutex.acquire()
        if not os.path.exists("temp_arg.json"):
            break
        mutex.release()
        time.sleep(1)
    mutex.release()
    # mutex2.get(block=True)
    args_d['socketIO'] = socketIO
    print("phase 3...")
    args_d['input_path'] = "./{0}{1}".format(panid, ext)
    args_d['input_path2'] = "{2}/{0}{1}".format(panid, ext, config_p2_folder)
    args_d['output_path'] = "{2}/{0}{1}".format(panid, ext, config_p3_folder)
    class_scores = img_combine2.img_combine2(
        namedtuple('Struct', args_d.keys())(*args_d.values()))
    print("blended...")
    img = misc.imread("./{0}{1}".format(panid, ext))
    img = misc.imresize(img, 10)
    class_image = np.argmax(class_scores, axis=2)
    pm = np.max(class_scores, axis=2)
    colored_class_image = utils.color_class_image(class_image, args_d['model'])
    # colored_class_image is [0.0-1.0], img is [0-255]
    alpha_blended = 0.5 * colored_class_image + 0.5 * img
    misc.imsave(filename + "_seg_blended" + ext, alpha_blended)
    print("upload...")
    sshupload(data, filename + "_seg_blended" + ext)
    print("garbage cleaning")
    print("success")
    self.emit("next")
# Solve the subproblem
objOpt, thetaOpt, lamOpt, muOpt = sub.solve_subproblem(y, xBar)  # (objOpt) upper bound
thetaBar.append(thetaOpt)
lamBar.append(lamOpt)
muBar.append(muOpt)
SUBD = np.amin([objOpt, SUBD])
print "THETA"
print thetaOpt
print "X"
print xBar
# Preprocess: deal with duplicate hyperplanes, and remove hyperplanes whose
# coefficients are all zero.
g_flag, replicated_marker, coefficients = pre.pre_process(xBar, thetaOpt,
                                                          lamOpt, muOpt, y)
# Print the flag and duplicate markers
#print "g_flag", g_flag
#print "replicated_marker", replicated_marker
#print len(coefficients)
#for co_index in xrange(len(coefficients)):
#    print coefficients[co_index]
# Get all the unique hyperplanes and save their coefficients.
linker, unique_coefficients = pre.unique_coeff(g_flag, replicated_marker,
                                               coefficients, M, K, N)
# Set a threshold used as the distance in the cell enumeration
distance = [np.spacing(1)]
for i in xrange(len(unique_coefficients)):
    sum = 0.0
def translate(sentence):
    sentence = pre_process(sentence)
    decoder = Decoder()
    sentence = decoder.decode(sentence)
    sentence = post_process(sentence)
    return sentence
from pathlib import Path

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from my_confusion_matrix import MyConfusionMatrix
#from pandas_ml import ConfusionMatrix
import pre_process

# Folder with the various subfolders
raw_data_folder = Path("../data/derive")
# Folder with all of the new processed files
proc_data_folder = Path("../data/processed")

# Check if we've already processed the data; pre-process if not
try:
    _ = next(proc_data_folder.iterdir())
except StopIteration:
    # No files in preprocessed, so generate them
    pre_process.pre_process(raw_data_folder, proc_data_folder)

# Use scikit-learn's CountVectorizer to convert text files into the X matrix by word frequency
corpus = proc_data_folder.iterdir()
vectorizer = CountVectorizer(input='filename', token_pattern=r'[a-zA-Z]+-?[a-zA-Z]+')
X = vectorizer.fit_transform(corpus)

# Generate y by iterating through files and extracting file names.
# Tested to make sure the order is the same as the X matrix.
y = np.array([])
for file in proc_data_folder.iterdir():
    # Labels are stored in the filename after the dash
    label = int(file.stem.split('-')[1])
    # Not the most efficient way, but we don't expect a crazy number of files, so append is fine
    y = np.append(y, [label])
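# A small in-memory illustration (toy documents rather than the processed files)
# of how CountVectorizer with this token_pattern builds the word-frequency
# matrix; get_feature_names_out assumes scikit-learn >= 1.0.
from sklearn.feature_extraction.text import CountVectorizer

demo_docs = ["the cat sat on the mat", "the dog ran"]
demo_vec = CountVectorizer(token_pattern=r'[a-zA-Z]+-?[a-zA-Z]+')
demo_X = demo_vec.fit_transform(demo_docs)
print(demo_vec.get_feature_names_out())
print(demo_X.toarray())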
if __name__ == "__main__":
    online = False
    if online:
        from config_online import *
    else:
        from config_offline import *
    print("******************* Preprocess ***********************")
    start = time.time()
    train_origin = pre.read_csv(path_train)
    print('Loading Train_set Done! Time: %.3f s' % (time.time() - start))
    print('origin_train_shape = [%s,%s]' % (train_origin.shape))
    train_origin = pre.pre_process(train_origin)
    print('pre_process Train_set Done! Time: %.3f s' % (time.time() - start))
    train = pre.feature_engineering(train_origin, True)
    print('feature_engineering Train_set Done! Time: %.3f s' % (time.time() - start))
    print('train_shape = [%s,%s]' % (train.shape))
    del train_origin
    test_origin = pre.read_csv(path_test)
    print('Loading Test_set Done! Time: %.3f s' % (time.time() - start))
    print('origin_test_shape = [%s,%s]' % (test_origin.shape))
    test_origin = pre.pre_process(test_origin)
    print('pre_process Test_set Done! Time: %.3f s' % (time.time() - start))