def plot_accuracies(file_train, features):
    """
    Plots the accuracies of a basic Multinomial Naive Bayes classifier as a
    function of the number of features in the Vectorizer
    :param file_train: the file to read the train data from
    :param features: a list of different numbers of features to try
    """
    df_train = pd.read_csv(file_train)  # read the file into a DataFrame
    # create a random train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        df_train['sample'], df_train['label'], test_size=0.2)
    # apply preprocessing
    X_train = pre.preprocessing(X_train.to_numpy())
    X_test = pre.preprocessing(X_test.to_numpy())
    acc = np.zeros(len(features))
    # loop over the number of features
    for i in range(len(features)):
        acc[i] = calculate_accuracies(X_train, X_test, y_train, y_test, features[i])
    # plot the results
    df_1 = DataFrame({"features": features, "acc": acc})
    plot = (ggplot(df_1)
            + geom_line(aes(x="features", y="acc"))
            + labs(title="accuracies on train test based on num_feat",
                   x="features", y="accuracy"))
    print(plot)
def main(yf):
    parser = ParseYaml(yf)
    to_preprocessing = parser.get_item("to_preprocessing")
    to_train = parser.get_item("to_train")
    to_demo = parser.get_item("to_demo")
    to_plots = parser.get_item("to_plots")

    preprocessing_config = parser.get_item("preprocessing")
    if to_preprocessing:
        preprocessing.preprocessing(preprocessing_config)

    train_config = parser.get_item("train")
    if to_train:
        train.train(train_config)

    demo_config = parser.get_item("demo")
    if to_demo:
        demo.demo(demo_config)

    plot_config = parser.get_item("plots")
    if to_plots:
        plots.line_plot(plot_config.get("line_plot"))
        plots.day_scatter_plot(plot_config.get("day_scatter_plot"))
        plots.multi_day_scatter_plot(plot_config.get("multi_day_scatter_plot"))
        plots.hist_plot(plot_config.get("hist_plot"))
        plots.day_map_plot(plot_config.get("day_map_plot"))
        plots.mutli_day_map_plot(plot_config.get("multi_day_map_plot"))
def pipeline(video, length):
    # parameter renamed from `len` to `length` to avoid shadowing the builtin
    preprocessing.preprocessing(video, length)
    os.system("rm -rf Anomaly-Detection/C3D_Features")
    os.system("sh mk.sh")
    os.system("sh C3D/C3D-v1.0/examples/c3d_feature_extraction/c3d_sport1m_feature_extraction_video.sh")
    os.system("rm -rf Anomaly-Detection/C3D_Features_Avg")
    os.system("sh mk1.sh")
    os.system("python convert.py")
    os.system("python run.py")
    print("Pipeline ended")
def main(args):
    # Time setting
    total_start_time = time.time()

    if args.preprocessing:
        preprocessing(args)

    if args.training:
        training(args)

    # Time calculation
    print(f'Done! ; {round((time.time()-total_start_time)/60, 3)}min spent')
def read_test_data():
    print('Load testing data:')
    with open('data/123456789/test-data.csv', 'r') as test_csv:
        reader = csv.reader(test_csv, delimiter=';')
        for index, row in enumerate(reader):
            if index == 0:
                X_test = np.array([preprocessing(row[0])])
                Y_test = np.array([get_index_of_category(row[1])])
            else:
                X_test = np.vstack((X_test, preprocessing(row[0])))
                Y_test = np.vstack((Y_test, get_index_of_category(row[1])))
    return X_test, Y_test
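# A possible variant of read_test_data above: calling np.vstack inside the loop copies the
# whole array on every row, which is quadratic in the number of samples. This is only a
# sketch of a linear-time alternative, assuming the same preprocessing() and
# get_index_of_category() helpers and the same file layout as the original.
def read_test_data_fast():
    import csv
    import numpy as np

    samples, labels = [], []
    with open('data/123456789/test-data.csv', 'r') as test_csv:
        reader = csv.reader(test_csv, delimiter=';')
        for row in reader:
            samples.append(preprocessing(row[0]))            # one feature vector per row
            labels.append([get_index_of_category(row[1])])   # keep the 2-D shape vstack produced
    # stack once at the end instead of re-allocating inside the loop
    return np.vstack(samples), np.vstack(labels)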
def main(args):
    start_time = time.time()

    if args.preprocessing:
        preprocessing(args)

    if args.training:
        training(args)

    if args.testing:
        testing(args)

    end_time = round((time.time() - start_time) / 60, 4)
    print(f'Done!; {end_time}min spent')
def predict_status(img_height, img_width, n_images, path_test, preprocess=True):
    # dimensions of our images
    test_data_dir = path_test

    # if preprocessing needs to be done, preprocessed images are stored in the preprocessed folder
    if preprocess:
        preprocessing(img_width, img_height, test_data_dir)

    nb_test_samples = n_images
    # number of images in each batch
    batch_size = 10

    # augmentation configuration used for testing:
    # only rescaling & the default preprocessing function for InceptionV3
    test_datagen = ImageDataGenerator(rescale=1. / 255,
                                      preprocessing_function=preprocess_input)
    test_generator = test_datagen.flow_from_directory(test_data_dir,
                                                      target_size=(img_width, img_height),
                                                      batch_size=batch_size,
                                                      class_mode='binary',
                                                      shuffle=False)

    # load the saved model for predictions
    model = load_model('Best_Model_Checkpoint.hdf5')

    # generate predictions
    pred = model.predict_generator(test_generator, steps=nb_test_samples / batch_size)
    # with 0.5 as threshold, assign the Normal/Abnormal class
    y_pred = (pred >= 0.5).astype(int)

    # store predictions along with filenames in a DataFrame
    predictions = pd.DataFrame()
    predictions['filename'] = test_generator.filenames
    predictions['pred'] = y_pred
    # .ix is deprecated; use .loc for label-based assignment
    predictions.loc[predictions['pred'] == 1, 'pred'] = 'Normal'
    predictions.loc[predictions['pred'] == 0, 'pred'] = 'Abnormal'
    predictions.to_csv('prediction_file_testset.csv', index=False)
def Myprediction(df, features, clf, name, item_category_list_unique):
    testdf = pd.read_csv('data/test/round1_ijcai_18_test_a_20180301.txt', sep=' ')
    testdf.context_timestamp += 8 * 60 * 60
    testdf = preprocessing.preprocessing(testdf)
    testdf.item_category_list.replace(item_category_list_unique,
                                      list(np.arange(len(item_category_list_unique))),
                                      inplace=True)
    listfeature = [
        'item_category_list-user_id_query_day_hour_item_brand_id',
        'item_category_list+item_pv_level',
        'item_price_level+shop_review_num_level',
        'item_price_level+item_category_list',
        'item_price_level-item_collected_level',
    ]
    for i in listfeature:
        if '+' in i:
            x = i.split('+')
            testdf[i] = add(testdf[x[0]], testdf[x[1]])
        elif '-' in i:
            x = i.split('-')
            testdf[i] = substract(testdf[x[0]], testdf[x[1]])
        elif '*' in i:
            x = i.split('*')
            testdf[i] = times(testdf[x[0]], testdf[x[1]])
        elif '/' in i:
            x = i.split('/')
            # fixed: the dividend should come from testdf, not from the training df
            testdf[i] = divide(testdf[x[0]], testdf[x[1]])
    prediction_format = pd.read_csv('data/output/0203.txt', sep=' ')
    train, predict = df, testdf
    clf.fit(train[features], train.is_trade,
            eval_set=[(train[features], train.is_trade)],
            eval_metric='logloss', verbose=True)
    predict['predicted_score'] = clf.predict_proba(predict[features])[:, 1]
    print(predict[['instance_id', 'predicted_score']])
    prediction_file = pd.merge(prediction_format[['instance_id']],
                               predict[['instance_id', 'predicted_score']],
                               on='instance_id', how='left')
    prediction_file.to_csv('data/output/{}.txt'.format(name), sep=' ', index=None)
    return clf
def test_model():
    """
    Test the model on the test set.
    """
    df_train = pd.read_csv("train_set.csv")
    df_test = pd.read_csv("test_set.csv")
    x_train, y_train = df_train['sample'], df_train['label']
    x_test, y_test = df_test['sample'], df_test['label']
    x_train = pre.preprocessing(x_train.to_numpy())
    x_test = pre.preprocessing(x_test.to_numpy())
    model = LinearSVC(C=LAMBDA)
    max_features = int(len(x_train) / 2)
    test_acc, train_acc, p = calculate_accuracies(x_train, x_test, y_train, y_test,
                                                  model, max_features)
    print("number of features: ", len(p))
    print("test Accuracy: ", test_acc, " train Accuracy: ", train_acc)
def Myprediction(df, features, clf, name, item_category_list_unique):
    testdf = pd.read_csv('data/test/round1_ijcai_18_test_a_20180301.txt', sep=' ')
    testdf.context_timestamp += 8 * 60 * 60
    testdf = preprocessing(testdf)
    # convert_time(testdf)
    testdf.item_category_list.replace(item_category_list_unique,
                                      list(np.arange(len(item_category_list_unique))),
                                      inplace=True)
    prediction_format = pd.read_csv('data/output/0203.txt', sep=' ')
    train, predict = df, testdf
    clf.fit(train[features], train.is_trade,
            eval_set=[(train[features], train.is_trade)],
            eval_metric='logloss', verbose=True)
    predict['predicted_score'] = clf.predict_proba(predict[features])[:, 1]
    print(predict[['instance_id', 'predicted_score']])
    prediction_file = pd.merge(prediction_format[['instance_id']],
                               predict[['instance_id', 'predicted_score']],
                               on='instance_id', how='left')
    prediction_file.to_csv('data/output/{}.txt'.format(name), sep=' ', index=None)
    return clf
def predict():
    data = {"success": False}
    if request.method == "POST":
        user_input = request.json["text"]
        preprocessed_input = preprocessing(user_input)
        array_dtype = get_dtype(preprocessed_input)
        # the array may need to be made C-contiguous before encoding,
        # e.g. preprocessed_input = preprocessed_input.copy(order="C")
        encoded_input = base64_encoding(preprocessed_input)

        # push the preprocessed input onto the Redis queue as a serialized numpy array
        k = str(uuid.uuid4())
        d = {"id": k,
             "shape": preprocessed_input.shape,
             "dtype": array_dtype,
             "data": encoded_input}
        rdb.rpush(DATA_QUEUE, json.dumps(d))

        # poll Redis until the worker writes the prediction back under this request id
        while True:
            output = rdb.get(k)
            if output is not None:
                output = output.decode("utf-8")
                data["predictions"] = json.loads(output)
                rdb.delete(k)
                break
            time.sleep(CLIENT_SLEEP)
        data["success"] = True
    return jsonify(data)
def train_doc2vec(texts, filename='doc2vec'):
    '''
    Trains and saves a doc2vec model on the given data.

    Args:
        texts: dict, where the key (str) is an id of a text and the value (str) is the text itself
        filename: str
    '''
    preprocessed_texts = []
    for id_ in texts:
        preprocessed_texts.append(
            TaggedDocument(words=preprocessing(texts[id_], stopwords_), tags=[id_]))
    print('preprocessing is done')

    model = Doc2Vec(vector_size=300, min_count=3, alpha=0.25, min_alpha=0.025,
                    epochs=100, workers=4, dm=1)
    model.build_vocab(preprocessed_texts)
    print(len(model.wv.vocab))
    model.train(preprocessed_texts, total_examples=model.corpus_count,
                epochs=model.epochs, report_delay=60)
    model.save(filename + '.model')
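# A minimal usage sketch for train_doc2vec above, assuming gensim is installed and that the
# same preprocessing() helper and stopwords_ list used during training are importable. The
# corpus dict and the query text here are purely illustrative.
from gensim.models.doc2vec import Doc2Vec

def doc2vec_usage_example():
    corpus = {"doc1": "first example text", "doc2": "second example text"}
    train_doc2vec(corpus, filename='doc2vec')

    # reload the saved model and infer a vector for an unseen text
    model = Doc2Vec.load('doc2vec.model')
    vector = model.infer_vector(preprocessing("a new query text", stopwords_))
    return vector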
def mean_GPP_nn():
    preds = mlp_predictions()
    m = np.mean(preds, axis=0)
    X, Y, Y_Preles = preprocessing.preprocessing()
    years = [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
             2008, 2009, 2010, 2011, 2012]
    arr = np.zeros((365, 13))
    for i in range(len(years)):
        x, y, y_nn = preprocessing.split_by_year(X, Y, m, years=[years[i]])
        arr[:, i] = y_nn[:365]

    fig, ax = plt.subplots(figsize=(8, 6), dpi=100)
    CI = np.quantile(arr, (0.05, 0.95), axis=1)
    fig.tight_layout(pad=1.5)
    ax.fill_between(np.arange(365), CI[0], CI[1], color="lightgreen", alpha=0.5)
    ax.plot(np.arange(365), np.mean(arr, axis=1), color="green",
            label="$\widehat{p2}_{m2} - \widehat{p2}_{m1}$", linewidth=1.0)
    ax.set_ylabel("GPP [g C m$^{-2}$ day$^{-1}$] ")
    ax.set_xlabel("Day of year")
    ax.set_ylim(-1, 20)
def load_casual_data(data_path):
    question_signal = u'Người báo tin:'
    answer_signal = u'Chatbot:'
    QA_dict = {}
    with open(data_path, 'r', encoding='utf-8') as fp:
        flag = False
        question = None
        answer = None
        for sen in fp:
            sen = sen.strip()
            if question_signal in sen:
                flag = True
                question = sen.split(question_signal)[1]
                question = preprocessing(question, tokenize=False)
            elif answer_signal in sen and flag:
                flag = False
                answer = sen.split(answer_signal)[1]
                # answer = preprocessing(answer).lower()
                QA_dict.update({question.lower(): answer.strip()})
            else:
                continue
    return QA_dict
def one_sample():
    import pandas as pd
    import numpy as np
    from preprocessing import preprocessing

    data = pd.read_csv("Churn_Modelling.csv")
    pre = preprocessing(data)
    scaler = pre[-1]

    geo = input("Geography: ")
    score = int(input("Credit Score: "))
    gender = input("Gender (m/f): ")
    age = int(input("Age: "))
    tenure = int(input("Tenure (years): "))
    balance = int(input("Balance: "))
    num_pro = int(input("Number of Products: "))
    credit = input("Does the customer have a credit card (y/n): ")
    active = input("Is this customer an active member (y/n): ")
    salary = int(input("Estimated Salary: "))

    geo = geo.lower()
    gender = gender.lower()
    credit = credit.lower()
    active = active.lower()

    # encode Geography into two indicator columns
    if geo == "france":
        geo = 0
        geo1 = 0
    elif geo == "spain":
        geo = 1
        geo1 = 0
    elif geo == "germany":
        geo = 0
        geo1 = 1

    if gender == "f":
        gender = 0
    elif gender == "m":
        gender = 1

    if credit == "y":
        credit = 1
    elif credit == "n":
        credit = 0

    if active == "y":
        active = 1
    elif active == "n":
        active = 0

    arr = np.array([[float(geo), geo1, score, gender, age, tenure, balance,
                     num_pro, credit, active, salary]])
    arr = scaler.transform(arr)
    return arr
def run(source, method, k):
    tags = [1, 1, 1, 1, 1, 1]
    if method == 'knn' and k is None:
        raise ValueError('Argument k is mandatory for KnnClassifier')

    if source == 'chemistry':
        glossary = get_chemistry() if tags[5] else None
        filepath = 'files/chemistry.xml'
    else:
        glossary = get_graphicdesign() if tags[5] else None
        filepath = 'files/graphic-design.xml'

    matrix, tag_names = preprocessing(filepath, tags, glossary)
    print(tag_names)

    if method == 'knn':
        k = int(math.fabs(int(k) or 5))
        r = knn(matrix, k)
    else:
        r = logistic_regression(matrix)
    print(r)
def get_corpus(path, meta=True):
    '''
    Collects a corpus and its metadata from files.

    Args:
        path: str
        meta: bool, if metadata is needed

    Returns:
        dict (doc_id, text) AND dict (doc_id, dict of meta) (if meta)
    '''
    counter = 0
    texts = {}
    # note: this dict was previously also named `meta`, which shadowed the boolean argument
    metadata = {}
    for file in os.listdir(path):
        with open(path + file, 'r', encoding='utf-8') as f:
            d = json.load(f)
            if d['text'] is not None:
                counter += 1
                texts[d['id']] = d['text']
                metadata[d['id']] = d
                metadata[d['id']]['text'] = preprocessing(metadata[d['id']]['text'])
    if meta:
        return texts, metadata
    return texts
def Analysis():
    while True:
        folder_name = input("Enter a folder name for storing the analysis results. "
                            "It will be created inside the result folder: ")
        try:
            os.mkdir("result/{}".format(folder_name))
            break
        except FileExistsError:
            print("A folder with that name already exists. Please choose another name.")
            continue

    tweetdata_name = input("Put the Twitter data file to analyze into the data folder "
                           "and enter its file name (including .csv): ")
    tweet_path = "data/{}".format(tweetdata_name)
    df = preprocessing(tweet_path)
    df = sentiment_predict(df, folder_name)
    sentiment_plot(df, folder_name)
    posi_corpus, posi_dictionary, nega_corpus, nega_dictionary = get_topic_num(df)
    os.mkdir("result/{}/positive_group".format(folder_name))
    os.mkdir("result/{}/negative_group".format(folder_name))
    plot_topics(posi_corpus, posi_dictionary, nega_corpus, nega_dictionary, folder_name)

    while True:
        finish_or_continue = int(input("Press [1] to change the number of groups and rerun "
                                       "the analysis, or [2] to finish: "))
        if finish_or_continue == 1:
            plot_topics(posi_corpus, posi_dictionary, nega_corpus, nega_dictionary, folder_name)
        elif finish_or_continue == 2:
            break
        else:
            print("Please enter 1 or 2.")
            continue
def processing():
    # Reading the config file
    specifications = readConfig()
    number = specifications[0]
    filenames = specifications[1]
    species = specifications[2]
    horizontalGridLat = specifications[3]
    horizontalGridLon = specifications[4]
    verticalGrid = specifications[5]
    output = specifications[6]
    filename = specifications[7]
    airden = specifications[8]
    airdenfiles = specifications[9]
    apriori = specifications[10]
    aks = specifications[11]

    modeloutput = []
    modeloutputFilenames = []
    airdenFilenames = []
    cache = []

    if airden in ("False", "false", "0", "no", "No"):
        airden = False
    else:
        airden = True

    aksNew = []
    apriori = pr.preprocessing(apriori)
    for elem in aks:
        aksNew.append(pr.preprocessing(elem))

    for i in range(number):
        if airden:
            print("Processing Air density..")
            airdenFilenames.append(density.airden(airdenfiles[i], verticalGrid, output))
            print("Processing Air density: Done!")
        print("Processing ASCII-Files: {}/{}".format(i + 1, number))
        nameOfFile = a2d.ipAscii(filenames[i], species, horizontalGridLat,
                                 horizontalGridLon, verticalGrid, output)
        print("Interpolating the model..")
        cache = pm.interpolateModel(filename, species, nameOfFile, verticalGrid, output)
        modeloutputFilenames.append(cache)
        print("Interpolation: Done!")
    print("Processing ASCII-Files: Done!")

    print("Computing total column...")
    for i in range(number):
        tc.createTC(filenames[i], apriori, species, aksNew[i], output,
                    modeloutputFilenames[i], airdenFilenames)
    print("Computing total column: Done!")
    return
def test_preprocessing():
    source = os.getcwd()
    big_folder = os.path.dirname(source) + '/Raman_Data'
    with os.scandir(big_folder) as entries:
        for entry in entries:
            foldername = entry.name
            print(foldername)
            subig_folder = os.path.dirname(source) + '/Raman_Data/' + foldername
            with os.scandir(subig_folder) as entries:
                pnumber = 0
                for entry in entries:
                    filename = entry.name
                    if filename.endswith(".txt"):
                        pnumber = pnumber + 1
                # `is not 0` compares identity; use != for a value comparison
                assert pnumber != 0, 'There is no txt file in ' + subig_folder
    pp.preprocessing(big_folder)
def run(self):
    preprocessing(self.axial_images, self.coronal_images, self.sagittal_images)
    try:
        self.examine_result_abnormal = Model(key='abnormal').get_prediction()
        self.examine_result_acl = Model(key='acl').get_prediction()
        self.examine_result_men = Model(key='men').get_prediction()
    except Exception:
        print("can't find models")
    self.get_data(self.examine_result_abnormal,
                  self.examine_result_acl,
                  self.examine_result_men)
    self.save_to_database()
    self.db.close()
    self.signals.finished.emit()
    return
def setUpClass(cls):
    cls.pp = preprocessing('insight_testsuite/test_1/input/complaints.csv')
    cls.processed_data = cls.pp.input_data
    with open(cls.pp.ipath, 'r') as file:
        reader = csv.reader(file, delimiter=',')
        cls.items = [row for row in reader]
    cls.data = cls.items[1:]
    cls.header = cls.items[0]
def execute():
    chat = preprocessing()
    msg = request.get_json()
    res = chat.predict(inputtxt=msg['msg'])
    return jsonify(res)
def filter(all_news_accounts):
    preprocessor = preprocessing()
    files = os.listdir('tweets')
    all_urls = {}
    all_texts = {}
    bad_urls = []
    for file in files:
        urls = {}
        texts = {}
        str_data = ''
        all_json = []
        with gzip.open('tweets/' + file, 'rb') as f:
            str_data = f.readlines()
            for line in str_data:
                line = json.dumps(line)
                all_json.append(json.loads(line))
        for tweet in all_json:
            if not tweet.find('media_url') == -1:
                try:
                    tweet = ast.literal_eval(tweet)
                    if not str(tweet['user']["id"]) in all_news_accounts:
                        continue
                    if tweet['is_quote_status']:
                        continue
                    if 'media' in tweet['entities'] and tweet['entities']['media'][-1]['type'] == 'photo':
                        text = preprocessor.clean(tweet['text'])
                        texts[tweet['id']] = text
                        urls[tweet['id']] = tweet['entities']['media'][-1]['media_url']
                    elif 'media' in tweet['extended_entities'] and tweet['extended_entities']['media'][-1]['type'] == 'photo':
                        text = preprocessor.clean(tweet['text'])
                        texts[tweet['id']] = text
                        urls[tweet['id']] = tweet['extended_entities']['media'][-1]['media_url']
                    else:
                        continue
                except KeyboardInterrupt:
                    print str(datetime.now()) + " program exit\n"
                    raise SystemExit
                except KeyError as e:
                    print str(datetime.now()) + ' ' + str(e) + '\n'
                    traceback.print_exc()
                except Exception as e:
                    print str(datetime.now()) + ' ' + str(e) + '\n'
                    traceback.print_exc()
                    raise SystemExit
        all_urls[file] = urls
        all_texts[file] = texts
    get_photos(all_urls, bad_urls)
    save_texts(all_texts, bad_urls)
def main(args):
    # Time setting
    total_start_time = time.time()

    # Preprocessing
    if args.preprocessing:
        preprocessing(args)

    # Augmentation by NER_Masking
    if args.augmenting:
        augmenting(args)

    # Training
    if args.training:
        training(args)

    # Time calculation
    print(f'Done! ; {round((time.time()-total_start_time)/60, 3)}min spent')
def form():
    resp = []
    if request.method == 'POST':
        text_news = request.form['text']
        audio = request.form['audio']
        print(text_news)
        print(audio)
        # fall back to speech-to-text when no text was submitted
        if text_news == '':
            text_news = speech_to_text(audio)
        resp = preprocessing(text_news)
    return render_template('result.html', respuesta=resp)
def search_d2v(raw_query):
    query = preprocessing(raw_query)
    query_vec = d2v_model.infer_vector(query)
    results = {}
    for item in d2v_index:
        this_doc_vec = d2v_index[item]
        results[item] = similarity(query_vec, this_doc_vec)
    # keep the ten most similar documents
    sort_res = sorted(results.items(), key=lambda kv: kv[1], reverse=True)[:10]
    return [res[0] for res in sort_res]
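# The similarity() helper used by search_d2v above is not shown in this snippet; a common
# choice for comparing doc2vec vectors is cosine similarity. A minimal sketch under that
# assumption, using numpy only:
import numpy as np

def cosine_similarity(vec_a, vec_b):
    """Cosine similarity between two dense vectors (higher means more similar)."""
    a, b = np.asarray(vec_a, dtype=float), np.asarray(vec_b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a @ b / denom) if denom else 0.0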
def allResults(stockName, filename, rangeSizeDataset, rangeTrainPercent):
    originalDataset = files.getDataset(filename='../original-data/' + filename)
    preprocessedDataset1Day = ppc.preprocessing(daysAhead=1, dataset=originalDataset)
    preprocessedDataset5Days = ppc.preprocessing(daysAhead=5, dataset=originalDataset)
    preprocessedDataset22Days = ppc.preprocessing(daysAhead=22, dataset=originalDataset)

    # print('------ FIXED WINDOW 1 DAY -------')
    # rs.getResultsFixedWindow(stockName=stockName, dataset=preprocessedDataset1Day, daysAhead=1,
    #                          rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)
    # print('------ FIXED WINDOW 5 DAYS -------')
    # rs.getResultsFixedWindow(stockName=stockName, dataset=preprocessedDataset5Days, daysAhead=5,
    #                          rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)
    # print('------ FIXED WINDOW 22 DAYS -------')
    # rs.getResultsFixedWindow(stockName=stockName, dataset=preprocessedDataset22Days, daysAhead=22,
    #                          rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)

    print('------ SLIDING WINDOW 1 DAY -------')
    rs.getResultsSlidingWindow(stockName=stockName, dataset=preprocessedDataset1Day, daysAhead=1,
                               rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)
    print('------ SLIDING WINDOW 5 DAYS -------')
    rs.getResultsSlidingWindow(stockName=stockName, dataset=preprocessedDataset5Days, daysAhead=5,
                               rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)
    print('------ SLIDING WINDOW 22 DAYS -------')
    rs.getResultsSlidingWindow(stockName=stockName, dataset=preprocessedDataset22Days, daysAhead=22,
                               rangeSizeDataset=rangeSizeDataset, rangeTrainPercent=rangeTrainPercent)
    print('------ ALL DATA FROM ' + stockName + ' FINISHED -------')
def main(clf_name, scoring_function):
    classifier = None
    random_grid = None
    nelement = 10000

    assert clf_name in ('RandomForest', 'AdaBoost')
    assert scoring_function in ('accuracy', 'f1', 'AMS')

    if clf_name == 'RandomForest':
        print("Using random forest")
        classifier = RandomForestClassifier()
        # Number of trees in the random forest
        n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in a tree
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap
        }
    else:
        print("Using AdaBoost")
        classifier = AdaBoostClassifier()
        random_grid = {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.05, 0.1, 0.3, 1]
        }

    print("Preprocessing data... ({} samples)".format(nelement))
    X, y = preprocessing("data.csv", nelement=nelement)
    X, y = shuffle(X, y, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0, stratify=y)
    randomized_grid_search(classifier, random_grid, X_train, y_train, X_test, y_test,
                           scoring_function)
def main():
    result_file = "test_salaries.csv"
    model_data = preprocessing(os.path.join(data_path, train_feature_file),
                               os.path.join(data_path, train_salary_file),
                               os.path.join(data_path, test_feature_file),
                               remove_cols=["companyId", "jobType"])
    generate_results(model_data, result_file)
def __init__(self):
    self.prePro = preprocessing()
    self.model = model()
    try:
        pickleIn = open("model.p", "rb")
        self.classifier = pickle.load(pickleIn)
    except (OSError, IOError) as e:
        # no serialized model yet: train one, then load it
        self.model.main()
        pickleIn = open("model.p", "rb")
        self.classifier = pickle.load(pickleIn)
def preprocessingAndFeatures(self, img, writer):
    preProcessing, lines = preprocessing(img)
    features = extractLBPLines(lines)
    writerList = [writer] * len(features)
    if writer is not None:
        self.featuresLabled.extend(features)
        self.yTrain.extend(writerList)
    else:
        self.featuresTest.extend(features)
def __init__(self):
    print("Reached init")
    preproc = preprocessing()
    self.trend_filter = trend_filtering()
    print("did stuff 1")
    self.in_data = preproc.get_data()
    print("did stuff 2")
    self.program_map = preproc.get_programs()
    print("did stuff 3")
    self.kmeans = kmeansclutering(self.in_data, self.program_map)
    print("Done with init")
def readAndTokenize(self, file_list, listindex):
    for file in file_list:
        fp = open(file, "r")
        word_list = preprocessing(fp.read())
        for w in word_list:
            if w in self.dictionary:
                self.dictionary[w][listindex] += 1
            else:
                # unseen word: start all class counts at 1, then bump this class
                newlist = [1] * 4
                self.dictionary[w] = newlist
                self.dictionary[w][listindex] += 1
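# An equivalent way to maintain the same per-class word counts without the explicit
# membership test, using collections.defaultdict. This is only a sketch and assumes the
# same preprocessing() tokenizer and the same 4-class count layout as readAndTokenize above.
from collections import defaultdict

def read_and_tokenize(file_list, listindex):
    counts = defaultdict(lambda: [1, 1, 1, 1])  # unseen words start with smoothed counts
    for path in file_list:
        with open(path, "r") as fp:
            for w in preprocessing(fp.read()):
                counts[w][listindex] += 1
    return counts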
def main():
    fpw = open("nboutput.txt", "w")
    classify = nbclassify()
    for root, direc, filelist in os.walk(sys.argv[1]):
        for file in filelist:
            if file.endswith('.txt') and len(direc) == 0:
                fp = open(os.path.join(root, file), "r")
                word_list = preprocessing(fp.read())
                result = classify.compute(word_list)
                result += os.path.join(root, file)
                fpw.write(result + "\n")
def main():
    # training
    labels_filename = ["obama_labels.txt", "romney_labels.txt"]
    book = xlrd.open_workbook("training-Obama-Romney-tweets.xlsx")
    words_filename = ["obama_words.txt", "romney_words.txt"]
    for i in range(0, 2):
        sheet = book.sheet_by_index(i)
        # extract data and store it in file featurelist.txt
        preprocessing().dataExtraction(2, sheet.nrows, sheet,
                                       labels_filename[i], words_filename[i])

    # testing
    book_test = xlrd.open_workbook("testing-Obama-Romney-tweets-3labels.xlsx")
    labels_filename_test = ["obama_labels_test.txt", "romney_labels_test.txt"]
    words_filename_test = ["obama_words_test.txt", "romney_words_test.txt"]
    for i in range(0, 2):
        sheet = book_test.sheet_by_index(i)
        # extract data and store it in file featurelist.txt
        preprocessing().dataExtraction(2, sheet.nrows, sheet,
                                       labels_filename_test[i], words_filename_test[i])
def test_knn(filepath, glossary, k=5):
    bits = functions.gen_bitlist(6)[1:]
    total = len(bits)
    best_score = 0
    best_tags = []
    counter = 1
    for tags in bits:
        matrix, tag_names = preprocessing(filepath, tags, glossary)
        r = knn(matrix, k)
        print(str(counter) + "/" + str(total), end='\r')
        if r > best_score:
            best_score = r
            best_tags = tag_names
        counter += 1
    print("---Best---")
    print("Score: " + str(best_score))
    print("Tag names: " + str(best_tags))
def task03_pre(k):
    preprocessing('task03', 'english', k)
import os, sys, json

dirr = os.path.dirname(sys.argv[0])
filename = os.path.join(dirr, '../../libraries')
sys.path.append(filename)
dirr = os.path.dirname(sys.argv[0])
filename = os.path.join(dirr, '../imageProcess')
sys.path.append(filename)

import preprocessing, proportion, centroid

dirr = os.path.dirname(sys.argv[0])
filename = os.path.join(dirr, '../../../img/generated_canvas/verify_11.png')
img = preprocessing.preprocessing(filename)
proportion = proportion.proportion(img)
centroidX = centroid.centroid(img)[0]
centroidY = centroid.centroid(img)[1]

result = {
    'prop': proportion,
    'centX': centroidX,
    'centY': centroidY,
}
print(json.dumps(result))
def __init__(self, alpha, beta, nr_topics, skip_lda=False, orig_lda=False, remove_poprock=False): """ Initialize """ self.alpha = alpha self.beta = beta self.nr_topics = nr_topics self.skiplda = skip_lda self.orig_lda = orig_lda self.fold = 0 # Dict to save recall/precision/f1 scores for every fold self.metric_folds = {} self.metric_folds_orig_lda = {} # Preprocess data prep = preprocessing.preprocessing(dump_files=False, load_files=True, dump_clean=False, load_clean=True) # Get lyrics self.total_dataset = prep.get_dataset() self.remove_poprock = remove_poprock # Possibly remove pop/rock if self.remove_poprock: print "Remove pop/rock: %s" %(str(remove_poprock)) self.total_dataset_temp = [] for i in range(0, len(self.total_dataset)): if self.total_dataset[i]['genre'] == 'pop/rock': continue else: self.total_dataset_temp.append(self.total_dataset[i]) self.total_dataset = self.total_dataset_temp # Note: to use smaller dataset add [:set] print "total nr of lyrics:", len(self.total_dataset) labels = [] #labels_subgenre = [] label_count = {} # Get all labels of dataset for item in self.total_dataset: labels.append(item['genre']) label_count[item['genre']] = label_count.get(item['genre'], 0) +1 # Set instance variable to list of set of all labels self.all_genres = list(set(labels)) #""" UNCOMMENT TO CREATE NEW FOLD INDICES # Get kfold training and test indices (folds: 10 so that it uses 90% for training data) # Stratified 10-fold cross-validation #skf = cross_validation.StratifiedKFold(labels, n_folds=5) #self.train_indices_folds = [] #self.test_indices_folds = [] #for train_index, test_index in skf: # self.train_indices_folds.append(train_index) # self.test_indices_folds.append(test_index) #file_indices = 'train_test_indices_stratified' #if remove_poprock: # file_indices += '_notpoprock' # print "not poprock" #pickle.dump((self.train_indices_folds, self.test_indices_folds), open(file_indices,"wb+")) #sys.exit() # OR LOAD FROM PICKLE FILE: if self.remove_poprock: self.train_indices_folds, self.test_indices_folds = pickle.load(open('train_test_indices_stratified_notpoprock',"r")) else: self.train_indices_folds, self.test_indices_folds = pickle.load(open('train_test_indices_stratified',"r")) # Create the training and test set. Both are set as instance variables self.create_train_test_set(0) # Initialize counts self.genre_count = np.zeros(len(self.all_genres), dtype=int) self.topic_count = np.zeros(nr_topics, dtype=int) # Counts for original LDA if self.orig_lda: self.topic_count_orig_lda = np.zeros(nr_topics, dtype=int) self.doc_word_count_orig_lda = np.zeros(len(self.dataset), dtype=int) # Initialization of matrices and dictionaries self._initialize_lists() # Initialize counts for matrices self._initialize_counts(load=False)
def main(_): # Create training directories now = datetime.datetime.now() train_dir_name = now.strftime('alexnet_%Y%m%d_%H%M%S') train_dir = os.path.join(FLAGS.train_root_dir, train_dir_name) checkpoint_dir = os.path.join(train_dir, 'checkpoint') tensorboard_dir = os.path.join(train_dir, 'tensorboard') tensorboard_train_dir = os.path.join(tensorboard_dir, 'train') tensorboard_val_dir = os.path.join(tensorboard_dir, 'val') if not os.path.isdir(FLAGS.train_root_dir): os.mkdir(FLAGS.train_root_dir) if not os.path.isdir(train_dir): os.mkdir(train_dir) if not os.path.isdir(checkpoint_dir): os.mkdir(checkpoint_dir) if not os.path.isdir(tensorboard_dir): os.mkdir(tensorboard_dir) if not os.path.isdir(tensorboard_train_dir): os.mkdir(tensorboard_train_dir) if not os.path.isdir(tensorboard_val_dir): os.mkdir(tensorboard_val_dir) # Write flags to txt flags_file_path = os.path.join(train_dir, 'flags.txt') flags_file = open(flags_file_path, 'w') flags_file.write('learning_rate={}\n'.format(FLAGS.learning_rate)) flags_file.write('dropout_keep_prob={}\n'.format(FLAGS.dropout_keep_prob)) flags_file.write('num_epochs={}\n'.format(FLAGS.num_epochs)) flags_file.write('batch_size={}\n'.format(FLAGS.batch_size)) flags_file.write('train_layers={}\n'.format(FLAGS.train_layers)) flags_file.write('multi_scale={}\n'.format(FLAGS.multi_scale)) flags_file.write('train_root_dir={}\n'.format(FLAGS.train_root_dir)) flags_file.write('log_step={}\n'.format(FLAGS.log_step)) flags_file.close() adlamb=tf.placeholder(tf.float32,name='adlamb') num_update=tf.placeholder(tf.float32,name='num_update') decay_learning_rate=tf.placeholder(tf.float32) dropout_keep_prob = tf.placeholder(tf.float32) is_training=tf.placeholder(tf.bool) time=tf.placeholder(tf.float32,[1]) # Model train_layers = FLAGS.train_layers.split(',') model = LeNetModel(num_classes=NUM_CLASSES, image_size=28,is_training=is_training,dropout_keep_prob=dropout_keep_prob) # Placeholders x_s = tf.placeholder(tf.float32, [None, 32, 32, 3],name='x') x_t = tf.placeholder(tf.float32, [None, 28, 28, 1],name='xt') x=preprocessing(x_s,model) xt=preprocessing(x_t,model) tf.summary.image('Source Images',x) tf.summary.image('Target Images',xt) print 'x_s ',x_s.get_shape() print 'x ',x.get_shape() print 'x_t ',x_t.get_shape() print 'xt ',xt.get_shape() y = tf.placeholder(tf.float32, [None, NUM_CLASSES],name='y') yt = tf.placeholder(tf.float32, [None, NUM_CLASSES],name='yt') loss = model.loss(x, y) # Training accuracy of the model source_correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(y, 1)) source_correct=tf.reduce_sum(tf.cast(source_correct_pred,tf.float32)) source_accuracy = tf.reduce_mean(tf.cast(source_correct_pred, tf.float32)) G_loss,D_loss,sc,tc=model.adloss(x,xt,y,yt) # Testing accuracy of the model correct_pred = tf.equal(tf.argmax(model.score, 1), tf.argmax(yt, 1)) correct=tf.reduce_sum(tf.cast(correct_pred,tf.float32)) accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) update_op = model.optimize(decay_learning_rate,train_layers,adlamb,sc,tc) D_op=model.adoptimize(decay_learning_rate,train_layers) optimizer=tf.group(update_op,D_op) train_writer=tf.summary.FileWriter('./log/tensorboard') train_writer.add_graph(tf.get_default_graph()) config=projector.ProjectorConfig() embedding=config.embeddings.add() embedding.tensor_name=model.feature.name embedding.metadata_path='domain.csv' projector.visualize_embeddings(train_writer,config) tf.summary.scalar('G_loss',model.G_loss) tf.summary.scalar('D_loss',model.D_loss) tf.summary.scalar('C_loss',model.loss) 
tf.summary.scalar('SA_loss',model.Semanticloss) tf.summary.scalar('Training Accuracy',source_accuracy) tf.summary.scalar('Testing Accuracy',accuracy) merged=tf.summary.merge_all() print '============================GLOBAL TRAINABLE VARIABLES ============================' print tf.trainable_variables() #print '============================GLOBAL VARIABLES ======================================' #print tf.global_variables() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver=tf.train.Saver() #saver.restore(sess,'log/checkpoint') # Load the pretrained weights #model.load_original_weights(sess, skip_layers=train_layers) train_writer.add_graph(sess.graph) # Directly restore (your model should be exactly the same with checkpoint) # saver.restore(sess, "/Users/dgurkaynak/Projects/marvel-training/alexnet64-fc6/model_epoch10.ckpt") print("{} Start training...".format(datetime.datetime.now())) #print("{} Open Tensorboard at --logdir {}".format(datetime.datetime.now(), tensorboard_dir)) gd=0 step = 1 for epoch in range(300000): # Start training gd+=1 lamb=adaptation_factor(gd*1.0/MAX_STEP) #rate=decay(FLAGS.learning_rate,gd,MAX_STEP) power=gd/10000 rate=FLAGS.learning_rate tt=pow(0.1,power) batch_xs, batch_ys = TRAIN.next_batch(FLAGS.batch_size) Tbatch_xs, Tbatch_ys = VALID.next_batch(FLAGS.batch_size) #print batch_xs.shape #print Tbatch_xs.shape summary,_,closs,gloss,dloss,smloss=sess.run([merged,optimizer,model.loss,model.G_loss,model.D_loss,model.Semanticloss], feed_dict={x_s: batch_xs,x_t: Tbatch_xs,time:[1.0*gd],decay_learning_rate:rate,adlamb:lamb,is_training:True,y: batch_ys,dropout_keep_prob:0.5,yt:Tbatch_ys}) train_writer.add_summary(summary,gd) step += 1 if gd%250==0: epoch=gd/(72357/100) print 'lambda: ',lamb print 'rate: ',rate print 'Epoch {5:<10} Step {3:<10} C_loss {0:<10} G_loss {1:<10} D_loss {2:<10} Sem_loss {4:<10}'.format(closs,gloss,dloss,gd,smloss,epoch) print("{} Start validation".format(datetime.datetime.now())) test_acc = 0. test_count = 0 print 'test_iter ',len(TEST.labels) for _ in xrange((len(TEST.labels))/5000): batch_tx, batch_ty = TEST.next_batch(5000) #print TEST.pointer,' ',TEST.shuffle acc = sess.run(correct, feed_dict={x_t: batch_tx, yt: batch_ty, is_training:True,dropout_keep_prob: 1.}) test_acc += acc test_count += 5000 print test_acc,test_count test_acc /= test_count if epoch==300: return #batch_tx, batch_ty = TEST.next_batch(len(TEST.labels)) #test_acc=sess.run(accuracy,feed_dict={x_t:batch_tx,y:batch_ty,is_training:False,dropout_keep_prob:1.0}) print len(batch_tx) print("{} Validation Accuracy = {:.4f}".format(datetime.datetime.now(), test_acc)) if gd%10000==0 and gd>0: #saver.save(sess,'./log/mstn2model'+str(gd)+'.ckpt') #print 'tensorboard --logdir ./log/tensorboard' #return pass
sys.path.append(filename)

import preprocessing, proportion, centroid, meanStDev

times = json.loads(sys.argv[1])
NUMDEVS = 3
imgArray = ['filename_1.png', 'filename_2.png', 'filename_3.png', 'filename_4.png',
            'filename_5.png', 'filename_6.png', 'filename_7.png', 'filename_8.png',
            'filename_9.png', 'filename_10.png']
dirr = os.path.dirname(sys.argv[0])
filename = os.path.abspath(os.path.join(dirr, '../../../img/generated_canvas'))

centroidsX = []
centroidsY = []
proportions = []
for img in imgArray:
    fileTemp = os.path.join(filename, img)
    imgTemp = preprocessing.preprocessing(fileTemp)
    imgTemp.save('fart.png')
    proportions.append(proportion.proportion(imgTemp))
    centroidsX.append(centroid.centroid(imgTemp)[0])
    centroidsY.append(centroid.centroid(imgTemp)[1])

meanProps, devProps = meanStDev.meanStDev(proportions)
meanCentsX, devCentsX = meanStDev.meanStDev(centroidsX)
meanCentsY, devCentsY = meanStDev.meanStDev(centroidsY)
meanTimes, devTimes = meanStDev.meanStDev(times)

minProps = meanProps - (NUMDEVS * devProps)
maxProps = meanProps + (NUMDEVS * devProps)
resultsProps = [minProps, maxProps]
def processing():
    # Reading the config file
    specifications = readConfig()
    number = specifications[0]
    filenames = specifications[1]
    species = specifications[2]
    horizontalGridLat = specifications[3]
    horizontalGridLon = specifications[4]
    verticalGrid = specifications[5]
    output = specifications[6]
    retrieval = specifications[7]
    airden = specifications[8]
    airdenfiles = specifications[9]
    apriori = specifications[10]
    aks = specifications[11]
    try:
        akssza = specifications[12]
    except IndexError:
        akssza = None

    modeloutputFilenames = []
    airdenFilenames = []
    cache = []

    if airden in ("False", "false", "0", "no", "No"):
        airden = False
    else:
        airden = True

    aksNew = []
    apriori = pr.preprocessing(apriori)
    for elem in aks:
        print("Preprocessing {}".format(elem))
        aksNew.append(pr.preprocessing(elem))

    for i in range(number):
        print("Processing ASCII-Files: {}/{}".format(i + 1, number))
        if airden:
            print("Processing Air density..")
            airdenFilenames.append(density.airden(airdenfiles[i], verticalGrid, output))
            print("Processing Air density: Done!")
        nameOfFile = a2d.ipAscii(filenames[i], species, horizontalGridLat,
                                 horizontalGridLon, verticalGrid, output)
        print("Interpolating the model..")
        cache = pm.interpolateModel(retrieval, species, nameOfFile, verticalGrid, output)
        modeloutputFilenames.append(cache)
        print("Interpolation: Done!")
    print("Processing ASCII-Files: Done!")

    print("Computing total column...")
    for i in range(number):
        print(aksNew[i])
        tc.createTC(filenames[i], apriori, species, output, modeloutputFilenames[i][0],
                    airdenFilenames, akssza, aksNew[i])
    print("Computing total column: Done!")

    print("Delete temporary files ...")
    f = open("./config.dat", "r")
    directory = f.readlines()[-2].split(":")[1].rstrip()
    for temporary in os.listdir(directory):
        if "PROCESSED" in temporary:
            os.remove("{}{}".format(directory, temporary))
    print("Delete temporary files: Done!")
    return
__author__ = 'root'

import numpy as np
import theano
import theano.tensor as T
import os.path
import matplotlib.pyplot as plot
import time

import RNN
import preprocessing

preproc = preprocessing.preprocessing()
sequenceLengthFileName = "data/out/sequence.csv"
sortNoShuffleTrainFileName = "data/out/trainNoShuffle.csv"
map48FileName = "data/out/map.csv"

if not os.path.isfile(sequenceLengthFileName):
    print("Generating sequence file...")
    preproc.generatingSequence(sortNoShuffleTrainFileName, sequenceLengthFileName)

preproc.loadTrainFile(sortNoShuffleTrainFileName)
preproc.load48Map(map48FileName)

layers = [69, 128, 48]
learningRate = 0.0001

x = T.matrix("x")
y = T.matrix("y")
memoryInitail = T.vector("memoryInitail")