def run_pipeline(file):
    if type(file) is str:
        # One file
        # Read raw data from yfinance api
        df = pd.read_csv(file)
        # preprocessing on raw data
        df = preprocess(df)
        # removing nan values
        df = fill_nans(df)
    else:
        # Multiple files
        # Read raw data from yfinance api for first file in list
        df = pd.read_csv(file[0])
        # preprocessing on raw data
        df = preprocess(df)
        # removing nan values
        df = fill_nans(df)
        # looping over the rest of the files in the list
        for i in range(1, len(file)):
            # Read raw data
            next_df = pd.read_csv(file[i])
            # preprocess
            next_df = preprocess(next_df)
            # fill nans
            next_df = fill_nans(next_df)
            # concatenate to df
            # inner join: only shared columns are kept
            df = pd.concat([df, next_df], join="inner")
    # now data is prepared to model
    return df
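# Hedged usage sketch for run_pipeline above. The CSV file names are hypothetical
# placeholders; pandas (pd) and the preprocess/fill_nans helpers are assumed to be
# importable from the surrounding project.
if __name__ == "__main__":
    single_df = run_pipeline("AAPL.csv")                  # a single raw yfinance export
    combined_df = run_pipeline(["AAPL.csv", "MSFT.csv"])  # several exports, inner-joined on shared columns
    print(combined_df.shape)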
def do_all(subfile="EEGbears.csv"):
    cols = ["HandStart", "FirstDigitTouch", "BothStartLoadPhase",
            "LiftOff", "Replace", "BothReleased"]
    ids_tot = []
    pred_tot = []
    for subject in range(1, 13):
        features_train, labels_train, nevents, _, ntrtimes, ica, FTtstep, _ = preprocessing.preprocess(subject=subject)
        # train classifiers. Note we can't use just one classifier object
        # because some events overlap so we want to be able to predict combinations of classes
        classifiers = [SKLearnClf() for event in range(nevents)]
        for event in range(nevents):
            classifiers[event].fit(features_train, labels_train[:, event])
        # read and prepare test data
        features_test, _, _, _, ntesttimes, _, _, ids = preprocessing.preprocess(
            subject=subject, train=False, ica=ica)
        ids_tot.append(ids)
        # get predictions for individual time steps
        ntimebins = features_test.shape[0]
        predlabels = np.zeros((ntimebins, nevents))
        for event in range(nevents):
            predlabels[:, event] = classifiers[event].predict_proba(features_test)[:, 1]
        predevents = preprocessing.labels_to_events(predlabels, FTtstep, ntesttimes)
        pred_tot.append(predevents)
        print("Finished subject " + str(subject) + ".")
    # create pandas object for submission, write to file
    submission = pd.DataFrame(index=np.concatenate(ids_tot),
                              columns=cols,
                              data=np.concatenate(pred_tot))
    submission.to_csv(subfile, index_label="id", float_format="%.3f")
    return submission
def main():
    '''
    Builds the stemmed and unstemmed vocabularies for 3 corpuses.
    '''
    print('Building Vocabulary...', end=' ')
    for fileName in fileNames[:-1]:  # we don't need recipe_links
        vocab, autocompleteVocab = set(), set()  # one stemmed, one unstemmed
        with open("corpus/{}.json".format(fileName), 'r') as f:
            stringList = json.load(f)
            docList, autocompleteDocList = [], []
            for i in range(len(stringList)):
                docList.append(preprocess(stringList[i]))
                autocompleteDocList.append(preprocess(stringList[i], stem=False))
                for word in docList[i]:
                    vocab.add(word)
                for word in autocompleteDocList[i]:
                    autocompleteVocab.add(word)
        with open('vocabulary/{}.json'.format(fileName), 'w') as vocabFile:
            json.dump(list(vocab), vocabFile)  # set isn't serializable
        with open('vocabulary/{}_autocomplete.json'.format(fileName), 'w') as vocabFile:
            json.dump(list(autocompleteVocab), vocabFile)  # set isn't serializable
    print('Done.')
def run(command, blocks, preprocessed):
    if command.parms:
        # Looping Array
        if command.parms[0].startswith("[") and command.parms[0].endswith("]"):
            arr = command.parms[0].replace("[", "").replace("]", "").split("|")
            arr = [x.lstrip(" ").strip(" ") for x in arr]
            var = command.parms[1] if len(command.parms) > 1 else None
            blocks = "\n".join([" "*block.spaces + block.text for block in blocks])
            total = []
            for x in arr:
                copy_blocks = blocks.replace(var, x) if var else blocks
                total.append(process(preprocess(copy_blocks, False)))
            return "\n".join(total)
        # Looping Number (Range)
        else:
            times = 0
            try:
                times = int(command.parms[0])
            except (TypeError, ValueError):
                return None
            var = command.parms[1] if len(command.parms) > 1 else None
            blocks = "\n".join([" "*block.spaces + block.text for block in blocks])
            total = []
            for i in range(times):
                copy_blocks = blocks.replace(var, str(i+1)) if var else blocks
                total.append(process(preprocess(copy_blocks, False)))
            return "\n".join(total)
def main(settings, metrics):
    # Begin processing validation images
    # troubled_ones = [3, 14, 22, 43, 66, 83, 97, 114, 161]
    # troubled_ones = [137]
    for i in range(0, len(settings['validation_files'])):
    # for i in troubled_ones:
        if ('Rink-Isbrae' in settings['validation_files'][i]
                or 'Upernavik' in settings['validation_files'][i]
                or 'Umiammakku' in settings['validation_files'][i]
                or 'Inngia' in settings['validation_files'][i]):
        # if 'Inngia' in settings['validation_files'][i]:
        # if i == 62:
            preprocess(i, settings, metrics)
            process(settings, metrics)
            postprocess(settings, metrics)
            # break

    # Print statistics
    # print_calfin_domain_metrics(settings, metrics)
    # print_calfin_all_metrics(settings, metrics)
    # plt.show()
    return settings, metrics
def __load_tempfile(self, doc_id, sentence, movie_id):
    preprocessed = preprocessing.preprocess(
        sentence, stemming=self.activate_stemming, stop=self.activate_stop)
    preprocessed = list(filter(None, preprocessed))
    word_count = len(preprocessing.preprocess(sentence, stemming=False, stop=False))
    for term in set(preprocessed):
        positions = [n for n, item in enumerate(preprocessed) if item == term]
        self.temp[term] = self.temp.get(term, {
            'term': term,
            'doc_count': 0,
            'movies': dict()
        })
        self.temp[term]['doc_count'] += 1
        self.temp[term]['movies'][movie_id] = self.temp[term]['movies'].get(movie_id, {
            '_id': movie_id,
            'doc_count': 0,
            'sentences': list()
        })
        self.temp[term]['movies'][movie_id]['doc_count'] += 1
        self.temp[term]['movies'][movie_id]['sentences'].append({
            '_id': doc_id,
            'len': word_count,
            'pos': positions
        })
def main():
    logger.info("Execution Started!!!")
    if DOWNLOAD_RAW_DATA:
        fetch_and_save_raw_data()
    if CREATE_RAW_DATASET:
        if not read_all_data():
            logger.error("Execution abruptly stopped while creating raw dataset!!!")
            return
    try:
        train_data = pd.read_csv(os.path.join(RAW_DATA_DIR, "raw_train_data.csv"),
                                 encoding="utf-8")
        if SIMPLE_PROCESSING_TYPE:
            logger.info("Performing simple text processing.")
            train_data_simple = preprocess(train_data)
            if isinstance(train_data_simple, pd.DataFrame):
                train_data_simple.to_csv(
                    os.path.join(SIMPLE_PROCESSED_DATA_DIR, "train_data_simple.csv"),
                    index=False,
                    encoding="utf-8",
                )
            else:
                logger.error("Unable to write simple processed data!!!")
                return
        if COMPLEX_PROCESSING_TYPE:
            logger.info("Performing complex text processing.")
            train_data_complex = preprocess(train_data, preprocess_type="complex")
            if isinstance(train_data_complex, pd.DataFrame):
                train_data_complex.to_csv(
                    os.path.join(COMPLEX_PROCESSED_DATA_DIR, "train_data_complex.csv"),
                    index=False,
                    encoding="utf-8",
                )
            else:
                logger.error("Unable to write complex processed data!!!")
                return
        if VECTORIZE_DATA_SIMPLE:
            logger.info("Vectorizing simple processed data.")
        if VECTORIZE_DATA_COMPLEX:
            logger.info("Vectorizing complex processed data.")
            if not vectorize_data(
                    os.path.join(COMPLEX_PROCESSED_DATA_DIR, "train_data_complex.csv"),
                    "complex",
            ):
                logger.error("Execution abruptly stopped while vectorizing complex data!!!")
                return
    except Exception as e:
        logger.error("Exception in main method : {}".format(str(e)))
        return
    logger.info("Execution successfully completed.")
def crossvalidation(subject=1):
    time.clock()
    features_train, labels_train, nevents, _, ntrtimes, ica, FTtstep, _ = preprocessing.preprocess(
        subject=subject, series=range(1, 7))
    preptime = time.clock()
    print("Preprocessing took " + str(preptime) + " seconds.")
    # train classifiers. Note we can't use just one classifier object
    # because some events overlap so we want to be able to predict combinations of classes
    classifiers = [SKLearnClf() for event in range(nevents)]
    for event in range(nevents):
        classifiers[event].fit(features_train, labels_train[:, event])
    traintime = time.clock() - preptime
    print("Trained the classifiers in " + str(traintime) + " seconds.")
    # read and prepare test data
    features_cv, labels_cv, _, events_cv, ncvtimes, _, _, _ = preprocessing.preprocess(
        subject=subject, train=True, series=range(7, 9), ica=ica)
    events_cv = events_cv.astype(int)  # I don't know why but it's an object array before this
    # separate some data for cross-validation
    # features_train, features_cv, labels_train, labels_cv = cross_validation.train_test_split(
    #     features, labels, test_size=0.3)
    # naively score classifiers on training set
    trscores = np.zeros((nevents))
    for event in range(nevents):
        trscores[event] = classifiers[event].score(features_train, labels_train[:, event])
    print("Scores on training set in binned time: " + str(trscores))
    # naively score classifiers on CV set
    testscores = np.zeros((nevents))
    for event in range(nevents):
        testscores[event] = classifiers[event].score(features_cv, labels_cv[:, event])
    print("Scores on CV set in binned time: " + str(testscores))
    # generate ROC curves for CV set in binned time
    predlabels_cv = np.transpose([classifiers[e].predict_proba(features_cv)[:, 1]
                                  for e in range(nevents)])
    rocscoresbinned = ROCcurve(predlabels_cv, labels_cv)
    print("For binned time...")
    print("Areas under ROC curves:")
    print(rocscoresbinned)
    print("Average ROC score:" + str(np.mean(rocscoresbinned)))
    # generate ROC curves for CV set in real time
    predevents_cv = preprocessing.labels_to_events(predlabels_cv, FTtstep, ncvtimes)
    rocscoresreal = ROCcurve(predevents_cv, events_cv)
    print("For real time...")
    print("Areas under ROC curves:")
    print(rocscoresreal)
    print("Average ROC score:" + str(np.mean(rocscoresreal)))
    return predevents_cv, events_cv, features_cv, classifiers, np.mean(rocscoresbinned), np.mean(rocscoresreal)
def load_preprocessed_data(path, training=True):
    if not training:
        id, text = load_data(path, training)
        t = text.apply(lambda x: preprocess(x))
        return id.values.tolist(), t.values.tolist()
    else:
        id, text, label = load_data(path, training)
        t = text.apply(lambda x: preprocess(x))
        l = label.apply(lambda x: emotion2label[x])
        return id.values.tolist(), t.values.tolist(), l.values.tolist()
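# Hedged usage sketch for load_preprocessed_data above; the file paths are placeholders
# and load_data/emotion2label are assumed to come from the surrounding module.
train_ids, train_texts, train_labels = load_preprocessed_data("train.txt", training=True)
test_ids, test_texts = load_preprocessed_data("test.txt", training=False)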
def getData(filePath):
    """
    @brief    Function to get the training data.
    @param    filePath  path to csv file from unity project.
    @returns  augmented training data and labels.
    """
    lines = []
    with open(filePath) as csvfile:
        reader = csv.reader(csvfile)
        for line in reader:
            lines.append(line)

    X_train = []
    measurements = []
    count = 0
    for line in lines:
        source_path = line[0:3]
        # filename = source_path.split('/')[-1]
        measurement = float(line[3])
        for path in source_path:
            image = imread(path)
            # if steering angle is 0
            if 0.00001 > measurement > -0.00001:
                # replace with a randomized value between -0.005 and +0.005
                randomSteer = np.random.random() * 0.01 - 0.005
                # take every 15th value with 0.0 as steer angle
                if count % 15 == 0:
                    measurements.append(measurement + randomSteer)
                    X_train.append(preprocessing.preprocess(image))
                count = count + 1
            else:
                # Limit model from applying full steering.
                if measurement > 0.9:
                    measurement = 0.9
                if measurement < -0.9:
                    measurement = -0.9
                measurements.append(measurement)
                # transform the image and augment
                # augmentation is done only for track images with curves.
                proc = preprocessing.preprocess(image)
                X_train.append(proc)
                aug = []
                if -0.4 > measurement or measurement > 0.30:
                    aug = transforms.augmentData(image, 1)
                if -0.9 >= measurement or measurement > 0.50:
                    aug += transforms.augmentData(image, 1)
                if -0.6 > measurement > -0.9:
                    aug += transforms.augmentData(image, 1)
                # append augmented data into training set.
                for im in aug:
                    proc = preprocessing.preprocess(im)
                    X_train.append(proc)
                    measurements.append(measurement)

    X_train = np.array(X_train)
    y_Train = np.array(measurements)
    return X_train, y_Train
def main():
    '''Main function to use from the command line: preprocesses the input to generate
    embeddings, detects aggression clauses using the provided approach, extracts
    features and labels from the training data and features from the input data,
    trains a model, classifies the test data using the trained model, and evaluates
    the predictions against the gold labels from the input.'''
    inputfile = 'sample_input.xls'
    preprocess(inputfile)
    get_predictions_rulebased()
    get_predictions_ml()
    # only clusters with enough data, else everything goes into the outlier cluster
    cluster_precursors()
def clf_predict(clf, image):
    X = preprocess(image)
    prediction = clf.predict(X)
    if isinstance(clf, keras.Model):
        prediction += clf.predict(preprocess(-image))
        prediction += clf.predict(preprocess(np.rot90(image, axes=(1, 2))))
        prediction += clf.predict(preprocess(np.rot90(image, axes=(2, 1))))
        prediction = np.argmax(prediction)
    else:
        prediction = prediction[0]
    return prediction
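# Note on clf_predict above: for Keras models this is effectively a small test-time
# augmentation -- the scores for the original image, its negative, and two 90-degree
# rotations are summed before taking the argmax; for other classifiers the first raw
# prediction is returned unchanged. (The rot90 axes imply `image` carries a leading
# batch/channel axis, so the rotation acts on the last two axes.)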
def load_everything():
    # loading schema
    connection_string = " user = '******' password = '******' host = '127.0.0.1' port = '5432' dbname = 'project' "
    conn = psycopg2.connect(connection_string)
    with conn.cursor() as cursor:
        setup_queries = open('schema.sql', 'r').read()
        cursor.execute(setup_queries)
        conn.commit()

    # preprocess the data
    preprocessing.preprocess()

    # loading csv into database
    load.load_into_database()
def main():
    """
    Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('start preprocessing data from raw:')
    if FLAGS.tweets:
        preprocessing.preprocess_tweets(save=True)
    else:
        preprocessing.preprocess()
    logger.info('files have been created in data/processed')
def test(categories):
    use_stem = True
    test_data = []
    test_labels = []
    tsv_out1 = open(".\\src\\test.tsv", "wb")
    tsv_out = csv.writer(tsv_out1, delimiter='\t')
    test_json = open(".\\src\\test.json")
    count_all = Counter()
    for r in test_json:
        tweet = json.loads(r)
        if (tweet["lang"] != "ru"):
            continue
        # Create a list with all the terms
        terms_all = [term for term in preprocess(tweet['text'], True)]
        # Update the counter
        count_all.update(terms_all)
        # tokens = preprocess(tweet['text'], True)
        # for token in tokens:
        #     print token
        # print tweet["text"].encode(sys.stdout.encoding, errors='replace')
        # tsv_out.writerow(["hz", tweet["text"].encode("utf-8")])
    for token in count_all.most_common(5):
        print(token[0] + ":" + str(token[1]))
    exit()
    tsv_out1.close()
    # exit(0)

    # test_in = open(".\\data\\parsed\\ttk_train.tsv")
    test_in = open(".\\src\\test.tsv")
    test_in = csv.reader(test_in, delimiter='\t')
    fin1 = open('vectorizer.pk', 'r')
    vectorizer = pickle.load(fin1)
    fin2 = open('classifier_linear.pk', 'r')
    classifier_linear = pickle.load(fin2)
    test_data, test_labels = preprocess(test_in, use_stem)
    test_vectors = vectorizer.transform(test_data)
    prediction_linear = classifier_linear.predict(test_vectors)
    print("Results for SVC(kernel=linear)")
    print(classification_report(test_labels, prediction_linear))
    with open("result_linear_test.txt", "wb") as result_out:
        i = 0
        for s in prediction_linear:
            if (test_labels[i] != prediction_linear[i]):
                result_out.write(
                    test_labels[i] + " : " + prediction_linear[i] + '\t' +
                    test_data[i].encode("utf-8") + '\n')
            i += 1
def all_to_filtered(doc_not_filtered_path, out_path, line_map_path):
    """map after filtering to before filtering"""
    assert os.path.isfile(doc_not_filtered_path), 'invalid doc_not_filtered_path: %s' % doc_not_filtered_path
    assert os.path.isdir(out_path), 'invalid out_path: %s' % out_path
    line_map_folder, name = os.path.split(line_map_path)
    assert os.path.isdir(line_map_folder), 'invalid line_map_folder: %s' % line_map_folder
    assert name.strip() != '', 'empty line_map_path name'
    filters = [docfilters.remove_doctests,
               docfilters.keep_first_description,
               docfilters.remove_wx_wrappers,
               docfilters.remove_parameter_descriptions,
               docfilters.replace_vertical_bars]
    preprocessing.preprocess(doc_not_filtered_path[:-4], out_path, False, filters, line_map_path)
def on_data(self, data):
    try:
        all_data = json.loads(data)      # tweets are dumped on the system in the json format
        tweet_text = all_data["text"]    # only the text part of the tweet will be fetched
        tweet = clean_tweet(tweet_text)  # further cleaning of the tweets
        preprocess(tweet)
        print(all_data)
        # ----------------------------------------------
    except:
        return True
def main():
    t = time()
    text = "Agar shi chala toh ye nhi dikhega Bidu"
    base_path = 'temp_images/'
    for file in os.listdir(base_path):
        try:
            os.remove(base_path + file)
        except FileNotFoundError:
            print('{} not deleted. \n'.format(file))
            continue
    ip_image_path = 'braille_scan.jpg'
    # ip_image_path = '/Users/ayushi/Desktop/uhack/braille_scan.jpg'
    try:
        os.remove('results.txt')
    except FileNotFoundError:
        print('results file not found.')
    preprocessed_image = base_path + 'preprecessed.jpg'
    try:
        preprocess(base_path, ip_image_path, preprocessed_image)
    except Exception as e:
        print(e)
    # preprocessing(ip_image_path, preprocessed_image)
    try:
        n_lines = horizontal_segmentation(base_path, preprocessed_image)
        vertical_segmentation(base_path, n_lines)
    except Exception as e:
        print(e)
    try:
        file = open('results.txt', 'r')
        text = file.read()
        file.close()
        print(text)
        error_removal()
        file = open('results.txt', 'r')
        text = file.read()
        file.close()
        print(text)
    except Exception as e:
        print(e)
    print(time() - t)
    return text
def run_pipeline():
    """
    Runs all functions in the pipeline.

    Parses tracking and events data from 52 XML files, then preprocesses the DataFrame
    to conform to the Metrica Sports format. Next, calculates EPV values to get the
    optimal passes using the Friends Of Tracking code, which can be found in the
    EPV_code folder. Then creates multiple features based on tracking and events data,
    followed by the analysis, using a Linear Regression and a Decision Tree.

    After each step, files are saved to the /data folder.
    """
    data_parser.parse_data()
    preprocessing.preprocess()
    generate_EPV_values.generate_epv_files()
    feature_engineering.engineer_features()
    analysis.run_analysis()
def detect_plagiarism(path1, path2, categories):
    '''Main entry point for plagiarism detection.'''
    preprocess(path1)
    preprocess(path2)
    path1 = os.path.join(os.getcwd(), 'temp', remove_suffix(os.path.basename(path1)))
    path2 = os.path.join(os.getcwd(), 'temp', remove_suffix(os.path.basename(path2)))
    details, summary = get_summary(categories, path1, path2)
    return details, summary
def main(settings, metrics):
    # Begin processing validation images
    # troubled_ones = [3, 14, 22, 43, 66, 83, 97, 114, 161]
    # troubled_ones = [161]
    for i in range(0, len(settings['validation_files'])):
    # for i in troubled_ones:
        preprocess(i, settings, metrics)
        process(settings, metrics)
        postprocess(settings, metrics)

    # Print statistics
    print_calfin_domain_metrics(settings, metrics)
    print_calfin_all_metrics(settings, metrics)
    return settings, metrics
def bm25_classifier(query, descriptions, labels):
    """
    Computes BM25 scores of a given query in relation to all selected and preprocessed
    dataset descriptions and selects all datasets whose score exceeds the threshold
    mean + 4 * standard deviation.
    Input: the query, the dataset descriptions, and their labels.
    Output: the list of labels whose datasets fit the query.
    """
    preprocessed_descriptions = []
    for description in descriptions:
        preprocessed_descriptions.append(preprocessing.preprocess(str(description)))
    tokenized_corpus = [doc.split(" ") for doc in preprocessed_descriptions]
    bm25_modell = BM25Plus(tokenized_corpus)
    tokenized_query = query.split(" ")
    scores = bm25_modell.get_scores(tokenized_query)
    mean_scores = mean(scores)
    standard_deviation_scores = stdev(scores)
    selected = []
    for i in range(0, len(descriptions)):
        label = labels[i]
        description = descriptions[i]
        score = scores[i]
        if score > (mean_scores + 4 * standard_deviation_scores):
            selected.append(label)
    return selected
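# Hedged usage sketch for bm25_classifier above (toy inputs; assumes the BM25Plus,
# mean/stdev and preprocessing imports used by the function are available in this module):
toy_descriptions = ["hourly air quality sensor readings",
                    "city bicycle rental counts per day",
                    "air pollution measurements by station"]
toy_labels = ["air_quality", "bike_rentals", "pollution"]
matching_labels = bm25_classifier("air pollution", toy_descriptions, toy_labels)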
def uploaded_file(filename):
    import preprocessing
    img = preprocessing.preprocess()
    import pytess
    ans = pytess.test('pre.tif')
    return render_template("result.html", org_img=filename, ans=ans)
def telemetry(sid, data):
    if data:
        # The current steering angle of the car
        steering_angle = data["steering_angle"]
        # The current throttle of the car
        throttle = data["throttle"]
        # The current speed of the car
        speed = data["speed"]
        # The current image from the center camera of the car
        imgString = data["image"]
        image = Image.open(BytesIO(base64.b64decode(imgString)))
        image_array = np.asarray(image)
        # Preprocess image before prediction
        image_array = preprocessing.preprocess(image_array)
        steering_angle = float(model.predict(image_array[None, :, :, :], batch_size=1))
        throttle = controller.update(float(speed))
        print(steering_angle, throttle)
        send_control(steering_angle, throttle)

        # save frame
        if args.image_folder != '':
            timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
            image_filename = os.path.join(args.image_folder, timestamp)
            image.save('{}.jpg'.format(image_filename))
    else:
        # NOTE: DON'T EDIT THIS.
        sio.emit('manual', data={}, skip_sid=True)
def get_batch(X_train, y_train):
    """
    Generate the batch for training with the data (X_train) and the corresponding ground truth.
    X_train (names): center_img.strip(), left_img.strip(), right_img.strip()
    y_train: angle, angle + steer_offset, angle - steer_offset
    Returns: A list of images (filenames) and steering angles -> only for one batch
    """
    imgList = np.zeros((BATCH_SIZE, 66, 200, 3), dtype=np.float32)
    steeringAngleList = np.zeros((BATCH_SIZE, ), dtype=np.float32)
    while True:
        for i in range(BATCH_SIZE):
            lowAngle_counter = 0
            # Get a valid angle (not a low angle when the percentage of low angles
            # in the batch is already exceeded)
            while True:
                imgFileName, angle = getRandImgAndAngle(X_train, y_train)
                if not validAngle(angle, lowAngle_counter, BATCH_SIZE):
                    # Get a new sample -> this one is not working
                    continue
                else:
                    # Fine. Increase and you shall pass ;)
                    lowAngle_counter += 1
                    break
            # Read image
            image = cv2.imread(imgFileName)
            # Preprocess
            image = preprocess(image)
            # Flip the image (sometimes)
            imgList[i], steeringAngleList[i] = flipImg(image, angle)
        yield imgList, steeringAngleList
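# Hedged usage sketch for the generator above: BATCH_SIZE and the helper functions
# (getRandImgAndAngle, validAngle, flipImg, preprocess) are assumed to be defined in
# this module, and X_train/y_train are the filename/angle arrays from the docstring.
batch_gen = get_batch(X_train, y_train)
images, angles = next(batch_gen)   # one batch, shaped (BATCH_SIZE, 66, 200, 3) and (BATCH_SIZE,)
# e.g. model.fit(batch_gen, steps_per_epoch=len(X_train) // BATCH_SIZE)  # Keras-style training loop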
def generate_feature_matrix(X_train, X_dev, X_test, preprocessing=False,
                            remove_stopwords=False, min_df=1):
    if preprocessing:
        X_train = preprocess(X_train, remove_stopwords)
        X_dev = preprocess(X_dev, remove_stopwords)
        X_test = preprocess(X_test, remove_stopwords)
    X_train = create_dataframe_for_training(X_train)
    X_dev = create_dataframe_for_training(X_dev)
    X_test = create_dataframe_for_training(X_test)
    vectorizer = CountVectorizer(min_df=min_df)
    X_train_fe = vectorizer.fit_transform(X_train)
    X_valid_fe = vectorizer.transform(X_dev)
    X_test_fe = vectorizer.transform(X_test)
    return X_train_fe.toarray(), X_valid_fe.toarray(), X_test_fe.toarray()
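# Hedged usage sketch for generate_feature_matrix above (toy inputs; assumes
# create_dataframe_for_training returns an iterable of documents that CountVectorizer
# can consume). Note the vectorizer is fit on the training split only and reused for dev/test.
X_tr, X_dv, X_te = generate_feature_matrix(
    ["good movie", "bad plot"], ["great acting"], ["terrible movie"],
    preprocessing=False, min_df=1)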
def get_contours(image, median_size=5):
    """
    :param image: The original image, in which you want to reduce the noise.
    :param median_size: the matrix dimensions of the median filter
    :return: numpy array that includes the contours in this image
    :draw: the original image with the contours detected and drawn on it
    """
    image_with_noise = image
    image = preprocess(image, median_size)
    blurred = cv2.pyrMeanShiftFiltering(image, 31, 91)
    gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    ret, threshold = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # ret, threshold = cv2.threshold(gray, 127, 255, 1)
    _, contours, _ = cv2.findContours(threshold, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
    # _, contours, _ = cv2.findContours(threshold, 1, 2)
    cv2.drawContours(image_with_noise, contours, -1, (0, 0, 255), 6)
    cv2.namedWindow("Contours Detection", cv2.WINDOW_NORMAL)
    cv2.imshow("Contours Detection", image_with_noise)
    cv2.waitKey()
    return contours
def run_predictions(dataframe, test_size, selected_model, parameters, metrics, cross_val, cv_k):
    """Puts together preprocessing, training and testing."""
    st.markdown(":chart_with_upwards_trend: Hyperparameters used: ")
    st.write(parameters)
    if cross_val:
        st.warning("Warning, only the first metric is selected when using Cross Validation.")

    # Preprocessing data
    x, y = preprocessing.preprocess(dataframe)
    st.success("Preprocessing completed!")
    model = get_model(selected_model, parameters)
    if cross_val:
        # model = get_model(selected_model, parameters)
        cross_validation(model, x, y, cv_k, metrics[0])
    else:
        # Training the model
        train_status = st.warning("Training model..")
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
        # model = get_model(selected_model, parameters)
        model.fit(X_train, y_train)
        train_status.success("Training completed!")

        # Testing the model
        test_status = st.warning("Testing model..")
        test_model(model, X_train, y_train, X_test, y_test, metrics)
        test_status.success("Testing completed!")
def main():
    df_data = preprocessing.get_data_db('../../sample_3days.sqlite')
    preprocessing.fill_missing_values(df_data)
    df_data = preprocessing.preprocess(df_data)
    # df_data.drop(preprocessing.dict_category_columns['text'], axis=1, inplace=True)

    target_columns = preprocessing.target_columns
    feature_columns = [column for column in df_data.columns if column not in target_columns]
    target_columns = ['ups']
    print(target_columns, '\n')
    print(feature_columns)

    indices = np.arange(len(df_data))
    splits = {}
    splits['train'], splits['test'] = train_test_split(indices, test_size=0.1)
    splits['train'], splits['validation'] = train_test_split(splits['train'], test_size=0.1)

    model = models.get_model()
    model = training.train(model, df_data[feature_columns], df_data[target_columns], splits)
    print(model.score(df_data[feature_columns], df_data[target_columns]))
def Main():
    fp = csv.reader(open('../../DataSet/Tweets_sample2.csv', 'rb'),
                    delimiter=',', quotechar='"')
    # fp = open('../../Data_Set/kaggle.txt', 'rb')
    data_list = processing_data_set.data_list(fp)
    random.shuffle(data_list)
    feature_list = []
    cnt = 0
    print len(data_list)
    for i in data_list:
        i[0] = preprocessing.preprocess(i[0])
        i[0] = all_words.to_all_words(i[0])
        i[0] = feature_words.to_feature_words(i[0])
        # print i[1]
        feature_list.extend(i[0])
    # print len(feature_list)

    # Preparing a dictionary of words in feature_list and maintaining their count
    dic = {}
    for i in feature_list:
        if i in dic:
            dic[i] = dic[i] + 1
        else:
            dic[i] = 1
    print len(dic)

    # Reverse sorting the dictionary to get the most frequently used words
    # print dic
    feature_list = sorted(dic, key=dic.__getitem__, reverse=True)[:3000]
    print "Length of feature list ", len(feature_list)
    # print feature_list[:15]
    '''
    data_list2 = []
    for i in data_list:
        data_list2.append((i[0], i[1]))
    '''
    # classifier_Self_NB.call_NB(data_list, feature_list)
    classifier2.NB_classifier(data_list, feature_list)
def process_all(files):
    count = 0
    geven_images = []
    geven_char = []
    for img_path, txt_path in tqdm(files):
        txt = get_text(txt_path)
        # if len(re.sub(r'[\u061F-\u066A|\s]', "", txt)) != 0: continue
        img = cv2.imread(img_path)
        linesOfWords, numWords, linesImages = preprocess(img)
        if numWords != len(txt.split(' ')):
            continue
        words = txt.split(' ')
        words_ind = 0
        for l, line_img in enumerate(linesOfWords):
            for w, word_img in enumerate(line_img):
                word = words[words_ind]
                count += len(word)
                words_ind += 1
                img_indx = len(word_img[0])
                geven_images += [word_img]
                geven_char += [word[0]]
                # for i, char in enumerate(word):
                #     if i == 0:
                #         img_indx -= characterDict[char][0]
                #         char_img = word_img[:, img_indx:img_indx+characterDict[char][0]]
                #     elif i == len(word) - 1:
                #         img_indx -= characterDict[char][2]
                #         char_img = word_img[:, img_indx:img_indx+characterDict[char][2]]
                #     else:
                #         img_indx -= characterDict[char][1]
                #         char_img = word_img[:, img_indx:img_indx+characterDict[char][1]]
                #     if char_img.shape[1] == 0: continue
                #     # cv2.imwrite(os.path.join('dataset', 'chars', f"{txt_path.split('/')[-1].split('.')[0]}_{words_ind-1}_{l}_{w}_{i}_{ord(char)}.png"), char_img)
    print(len(geven_images))
    return geven_images, geven_char
def semantics(doc):
    prep = preprocess(doc)
    return (
        flatten(prep.pos_tags()),
        prep.noun_phrases(),
        flatten(prep.get_entities())
    )
def train_neural_network(path):
    '''Didn't have this working for the hackathon'''
    X_train, X_test, X_val, y_val, y_train, y_test = preprocess(path)

    def change(x):
        if x == -1:
            return 2
        if x == 0:
            return 0
        if x == 1:
            return 1

    print(type(y_train))
    # the neural network doesn't allow negative numbers in labels, so all -1 labels are mapped to 2
    y_train = np.asarray(list(map(change, y_train)))
    y_test = np.asarray(list(map(change, y_test)))
    y_val = np.asarray(list(map(change, y_val)))

    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(35400,)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(3, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=6)
    evaluation = model.evaluate(X_test, y_test)
    accuracy = evaluation[1]
    model.save('sequential_32_Dropout.h5')
    return accuracy
def run_eda(df_bank, df_fb, df_retail):
    '''a function to run the components of eda'''
    data = p.preprocess(df_bank, df_fb, df_retail)
    data.merge()
    data.clean()
    data.add_count()
    df_final = data.df_final

    # plot the count of each category
    count_plot(df_final)

    # plot the distribution of the categories
    dist_plot(df_final)

    # plot the distribution of the word counts
    word_count_plot(df_final)

    # plot the top n words for each category
    complaint_count = data.top_n_words(df_final[(df_final["Complaint"] == 1)]["Text"])
    plot_top_n_words(complaint_count, "Complaint")
    compliment_count = data.top_n_words(df_final[(df_final["Compliment"] == 1)]["Text"])
    plot_top_n_words(compliment_count, "Compliment")
    suggestion_count = data.top_n_words(df_final[(df_final["Suggestion"] == 1)]["Text"])
    plot_top_n_words(suggestion_count, "Suggestion")
    plt.show()
def _build_bin_dict(self):
    """
    Build a binary dictionary containing all terms and the value 1.0 to
    indicate their presence in the data set.
    """
    if self.data:
        return dict((preprocess(term), 1.0) for term in self.data)
    return {}
def _build_tf_dict(self):
    """
    Build a term-frequency dictionary containing terms and their counts.
    """
    if self.data:
        d = defaultdict(int)
        for term in self.data:
            normalized = preprocess(term)
            d[normalized] += 1
        if len(d.values()) > 0:
            self._max_score = max(d.values())
        return d
    return {}
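# Toy illustration of the two dictionaries built above, assuming preprocess() simply
# lowercases a term: for self.data = ["Cat", "cat", "dog"] the binary dictionary is
# {"cat": 1.0, "dog": 1.0}, while the term-frequency dictionary is {"cat": 2, "dog": 1}
# and _max_score is set to 2.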
def corpus_generation():
    filelist = glob.glob('../../web_spider/bilingual_article/正常语料/[0-9]*-*')
    chinese_corpus = []
    chinese_corpus_article_tokens = []
    chinese_corpus_tokens = []
    for file in filelist:
        text = fileload(file)
        chinese, english = preprocess(text)
        chinese_corpus.extend([chinese[1:]])
    for article in chinese_corpus:
        for sentence in article:
            chinese_corpus_article_tokens.extend(jieba.lcut(sentence, cut_all=False))
        chinese_corpus_tokens.append(chinese_corpus_article_tokens)
        chinese_corpus_article_tokens = []
    return chinese_corpus_tokens
def main(argv):
    malice_file = open(argv[1])
    malice_text = malice_file.read()
    alice_path = ""
    if argv[1].find("/") != -1:
        alice_path = argv[1][:(argv[1].rindex("/") + 1)]
    malice_text = PS.preprocess(malice_text, alice_path)
    parsingTree = parsing.getTree(malice_text).asList()
    lexer = Lexer()
    lexer.addMaliceTokens()
    parsingTree = lexer.replaceInTree(parsingTree)
    assemblyFile = argv[1][:(argv[1].rindex('.'))] + ".asm"
    if semantics.check(parsingTree):
        CG.generate(parsingTree, assemblyFile)
def main(run=1, force_run=False):
    mkdir(_model_folder)
    if not force_run and len(os.listdir(_model_folder)) > 0:
        ans = input("Found something in '%s', which may be overwritten.\nProceed? [y/n]: " % _model_folder)
        if ans.lower() == 'n':
            exit(-1)
    for k in range(run):
        samples = preprocessing.tp_sample.get_samples(_sample_folder)
        if _name_filter is not None:
            samples = [s for s in samples if s.batch_name in _name_filter]
        print(np.var([get_label(s) for s in samples]))
        random.shuffle(samples)
        batches = preprocessing.batch_data(samples, cross_valid)
        for i in range(cross_valid):
            valid_samples = batches[i]
            train_samples = []
            savedir = "%s/%d/" % (_model_folder, i + 1)
            mkdir(savedir)
            for j in range(cross_valid):
                if j != i:
                    train_samples.extend(batches[j])
            if _filter_samples:
                train_samples = preprocessing.score_portion(train_samples, get_label, _high_portion, _low_portion)
            train_texts = [sample.text for sample in train_samples]
            valid_texts = [sample.text for sample in valid_samples]
            train_matrix, valid_matrix, words = preprocessing.preprocess(
                train_texts, valid_texts, savedir=savedir, **_strategy_parameters)
            train_labels = np.asarray([get_label(sample) for sample in train_samples])
            valid_labels = np.asarray([get_label(sample) for sample in valid_samples])
            model, valid_mse = None, None
            if _model_type == "NN":
                model = Neural_Network(_attributes, _hidden_nodes=hidden_nodes, _learning_rate=learning_rate)
                valid_mse = model.train(train_matrix, train_labels, valid_matrix, valid_labels, max_iter=15000)
            else:
                model = SVR(**_svm_parameters)
                valid_mse = model.train(train_matrix, train_labels, valid_matrix, valid_labels)
            model.save(savedir)
            model.destroy()
            print("Fold %2d: %.4f" % (i + 1, valid_mse))
def prepare_data(f, categories, lowercase=True, stemming=False):
    data = []
    labels = []
    for row in f:
        if row[0] in categories:
            # if (ctg == "negative"):
            #     tokens = preprocess(row[1].decode("utf-8"), lowercase, False)
            # else:
            tokens = preprocess(row[1].decode("utf-8"), lowercase, stemming)
            new_str = " ".join([token for token in tokens])
            data.append(new_str)
            if (row[0] == 'negative'):
                labels.append(-1)
            elif (row[0] == 'neutral'):
                labels.append(0)
            elif (row[0] == 'positive'):
                labels.append(1)
    return data, labels
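# Hedged usage sketch for prepare_data above: the rows mimic the CSV layout the function
# expects (column 0 = sentiment label, column 1 = raw UTF-8 encoded tweet text), and
# preprocess is the tokenizer used elsewhere in this module.
toy_rows = [["positive", "Great service".encode("utf-8")],
            ["negative", "Awful support".encode("utf-8")]]
toy_data, toy_labels = prepare_data(toy_rows, ["positive", "negative"], lowercase=True, stemming=False)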
def corpus_generation():
    '''Converts the raw crawler data into a document-level corpus.
    Returns a list in which each element is the tokenized content of one
    article (title removed), with no further processing applied.
    '''
    filelist = glob.glob('../../web_spider/bilingual_article/正常语料/[0-9]*-*')
    english_corpus = []
    english_corpus_article_tokens = []
    english_corpus_tokens = []
    for file in filelist:
        text = fileload(file)
        chinese, english = preprocess(text)
        english_corpus.extend([english[1:]])
    for article in english_corpus:
        for sentence in article:
            english_corpus_article_tokens.extend(nltk.word_tokenize(sentence))
        english_corpus_tokens.append(english_corpus_article_tokens)
        english_corpus_article_tokens = []
    return english_corpus_tokens
def main(run=1, force_run=False):
    mkdir(_model_folder)
    if not force_run and len(os.listdir(_model_folder)) > 0:
        ans = input("Found something in '%s', which may be overwritten.\nProceed? [y/n]: " % _model_folder)
        if ans.lower() == 'n':
            exit(-1)
    for k in range(run):
        samples = preprocessing.tp_sample.get_samples(_sample_folder)
        if _name_filter is not None:
            samples = [s for s in samples if s.batch_name in _name_filter]
        print("Variance: %.3f" % np.var([get_label(s) for s in samples]))
        random.shuffle(samples)
        batches = preprocessing.batch_data(samples, _cross_valid)
        for i in range(_cross_valid):
            valid_samples = batches[i]
            train_samples = []
            savedir = "%s/%d/" % (_model_folder, i + 1)
            mkdir(savedir)
            for j in range(_cross_valid):
                if j != i:
                    train_samples.extend(batches[j])
            train_texts = [sample.comment for sample in train_samples]
            valid_texts = [sample.comment for sample in valid_samples]
            train_matrix, valid_matrix, words = preprocessing.preprocess(
                train_texts, valid_texts, savedir=savedir, **_strategy_parameters)
            # print("\tBag of words: %d" % len(words))
            train_labels = np.asarray([get_label(sample) for sample in train_samples])
            valid_labels = np.asarray([get_label(sample) for sample in valid_samples])
            model, valid_mse = None, None
            model = SVR(**_svm_parameters)
            valid_mse = model.train(train_matrix, train_labels, valid_matrix, valid_labels)
            model.save(savedir)
            model.destroy()
            print("Fold %2d: %.4f" % (i + 1, valid_mse))
def hidden_route():
    '''
    Used to receive calls from the server giving new data.
    '''
    # Receiving the raw data:
    # string json output
    text = json.dumps(request.json, sort_keys=True, indent=4, separators=(',', ': '))
    print 'hello'
    # with open('data/example.json') as f:
    #     json.dump(f)
    # print text

    # Cleaning the data:
    # string json input -> pandas dataframe output
    input_data = json.loads(text)
    clean_data = preprocess(input_data)

    # Predicting on the data:
    # pandas dataframe input -> boolean output
    prediction = model.predict_proba(clean_data)

    # Appending the prediction and time received to the cleaned data:
    # pandas dataframe input -> dict output
    # return_data = clean_data.to_dict(orient='list')
    input_data['prediction'] = str(prediction[0][0])
    # return_data = dict((k, str(v[0])) for k, v in return_data.iteritems())
    input_data['time_received'] = time.time()

    # Writing the full data to the database:
    # dict input
    tab.insert(input_data)
    return ''
def main():
    par_dir = 'recordings/smartphone/'
    rec_num = '1430177546499'
    wavfile = par_dir + rec_num + '/' + rec_num + '.wav'
    magfile = par_dir + rec_num + '/' + rec_num + 'Mag.csv'
    accelfile = par_dir + rec_num + '/' + rec_num + 'Accel.csv'
    truthfile = par_dir + rec_num + '/' + rec_num + 'Truth.csv'
    data, truth = preprocessing.preprocess(wavfile, magfile, accelfile, truthfile)

    print "Begin Training..."
    if new_model:
        pipe = linear_model.LogisticRegression(solver='lbfgs', verbose=1)
    else:
        with open("pipe.model", "rb") as f:
            pipe = pickle.load(f)
    pipe.fit(data, truth)

    f = open("pipe.model", "wb")
    pickle.dump(pipe.sparsify(), f)

    print "Training Score:"
    print pipe.score(data, truth)
          5.0 / rng, 1.0]                  # All colors are scaled between 0 and 1
clr_indx = zip(thresh, colors)
cmap = m.colors.LinearSegmentedColormap.from_list('custom', clr_indx, 256)
if True:  # Gordon requested we examine the fronts in gray scale mapping
    cmap = cm.get_cmap('gray')             # overwrite the previous color mapping
    minv = 0.01
    maxv = 0.2
cmap.set_bad('black')                      # Set the land to black (from masked)
cmap.set_under('white')                    # Set the clouds to white. set_under
                                           # means set all the values under minval to
                                           # white. (clouds are -1 and the minval is > 0.
                                           # Roughly 0.011)
cmap.set_over('white')                     # set_over does the opposite of set_under

edgePts = pp.preprocess(data, cutoff)      # This is the actual routine which
                                           # finds the 'front'. The edge pixels
                                           # around each voronoi region are the
                                           # approximated front
masked.data[edgePts.tolist()] = 255        # Here we set all the edge pixels
                                           # to 255, or white thanks to the
                                           # cmap.set_over routine.
plt.imshow(masked, cmap=cmap, vmin=minv, vmax=maxv)             # create plot
fig = plt.gcf()                                                 # get current figure
fig.set_size_inches((2 * cols) / 100.0, (2 * rows) / 100.0)     # set save
                                                                # parameters
plt.savefig(rsltsDir + f[:-3] + "_%f.png" % (c), dpi=100)
# By combining set_size and the dpi in savefig, you have complete
print("Preprocessing.. ")
news_samples = [sample for sample in news_samples
                if sample.word_count >= _min_word_count and sample.section in _section_filter]
random.shuffle(news_samples)
n_samples = len(news_samples)
train_samples = news_samples[0:int(n_samples*_train_ratio)]
test_samples = news_samples[int(n_samples*_train_ratio):n_samples]

print("Samples distribution:", preprocessing.samples_statistics(news_samples, _section_filter, get_section))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _section_filter, get_section))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _section_filter, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False,
    reduction=_reduction, reduce_n_attr=_reduce_n_attr, stem_words=_stem_words)

print("Generating labels..")
train_labels = preprocessing.samples_to_label(train_samples, _section_filter, get_section)
test_labels = preprocessing.samples_to_label(test_samples, _section_filter, get_section)

print("Training..")
kmeans = KMeans(n_clusters=len(_section_filter))
reference_output = kmeans.fit_predict(train_matrix)

# count[c, j]: for the cth cluster, how many texts belong to the jth section
count = np.zeros((len(_section_filter), len(_section_filter)))
for i in range(reference_output.shape[0]):
    c = reference_output[i]
    j = _section_filter.index(get_section(train_samples[i]))
    count[c, j] += 1

"""
Created on Fri Feb 19 15:16:33 2016

@author: charles
"""
# LinearRegression
import pandas as pd
import preprocessing as pp
import numpy as np
import sklearn.linear_model

# %%
# this takes roughly 1 minute
X_train, Y_train, Date_train, Assignment_train = pp.preprocess(nrows=4000000)
# %%
X_train, X_valid, Y_train, Y_valid = crossvalidate(X_train, Y_train, 0.2)
# %%
model = sklearn.linear_model.LinearRegression()
model.fit(X_train, Y_train)
# %%
Y_test = model.predict(X_valid)
Y_train_pred = model.predict(X_train)
    return sample.question

samples = preprocessing.tp_sample.get_samples(_sample_folder)
samples = [s for s in samples if s.batch_name == _batch_name and s.question is not None]
random.shuffle(samples)
n_samples = len(samples)
train_samples = samples[0:int(n_samples*_train_ratio)]
test_samples = samples[int(n_samples*_train_ratio):n_samples]

print("Samples distribution:", preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _classes, get_question))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False)

if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _classes, get_question)
    test_labels = preprocessing.samples_to_label(test_samples, _classes, get_question)
    model = SVM()
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)
elif _model == "NN":
    train_dists = preprocessing.samples_to_dists(train_samples, _classes, get_question)
    test_dists = preprocessing.samples_to_dists(test_samples, _classes, get_question)
    model = Neural_Network(_n_factors=train_matrix.shape[1], _learning_rate=_learning_rate,
                           _hidden_nodes=_hidden_nodes, _last_layer=len(_classes))
    model.train(train_matrix, train_dists, test_matrix, test_dists)
    predict = model.predict(test_matrix)
def evaluate_classifier(numTrainR, numTrainN, numTestR, numTestN, model, verbose):
    '''
    I used code from http://www.nltk.org/book/ch06.html for this
    '''
    # load raw tweets:
    rawRacistTweets = loadRacistTweets(numTweets=numTrainR + numTestR, excludeJokes=True)
    rawNormalTweets = loadNonRacistTweets(numTweets=numTrainN + numTestN)
    # rawTweets = rawRacistTweets + rawNormalTweets
    print("Number of racist tweets: {}.".format(len(rawRacistTweets)))
    print("Number of normal tweets: {}.".format(len(rawNormalTweets)))

    # split into train/test sets
    trainR = rawRacistTweets[0:numTrainR]
    print(len(trainR))
    testR = rawRacistTweets[numTrainR:numTrainR + numTestR]
    print(len(testR))
    trainN = rawNormalTweets[0:numTrainN]
    print(len(trainN))
    testN = rawNormalTweets[numTrainN:numTrainN + numTestN]
    print(len(testN))

    # combine racist/non-racist tweets into single train/test datasets
    trainTweets = trainR + trainN
    testTweets = testR + testN

    # pre-process tweets (i.e. remove certain words):
    preprocessedTrainTweets = [(preprocess(d), c) for (d, c) in trainTweets]
    preprocessedTestTweets = [(preprocess(d), c) for (d, c) in testTweets]

    featureExtractor = FeatureExtractor([FeatureExtractor.UNIGRAM, FeatureExtractor.BIGRAM])
    # featureExtractor.train_TF_IDF(trainTweets)

    # compute training & testing features
    trainFeats = [(featureExtractor.get_feature_vector(d), c) for (d, c) in preprocessedTrainTweets]
    testFeats = [(featureExtractor.get_feature_vector(d), c) for (d, c) in preprocessedTestTweets]

    if model == 'SVM':
        classifier = nltk.classify.SklearnClassifier(LinearSVC())
        classifier.train(trainFeats)
        # evaluate SVM classifier
        print("----------------------")
        print("SVM Classifier")
    elif model == 'RF':
        rf = RF(n_estimators=75, max_features='sqrt', class_weight='auto',
                criterion="entropy", min_samples_split=9, random_state=0)
        classifier = nltk.classify.SklearnClassifier(rf)
        classifier.train(trainFeats)
        # evaluate RF classifier
        print("----------------------")
        print("RF Classifier")
        # note that TF-IDF cannot be set when model=NB
    elif model == 'NB':
        # Bayes
        classifier = nltk.NaiveBayesClassifier.train(trainFeats)
        print("----------------------")
        print("NB Classifier")

    print("accuracy: %.3f" % nltk.classify.accuracy(classifier, testFeats))
    Y_test = [testFeat[1] for testFeat in testFeats]
    Y_pred = classifier.classify_many([testFeat[0] for testFeat in testFeats])
    conf = metrics.confusion_matrix(Y_test, Y_pred, [0, 1])
    precision, recall, fscore = precision_recall_fscore(conf, 1)
    print("precision: %.3f" % precision)
    print("recall: %.3f" % recall)
    print("f1 score: %.3f" % fscore)
    print("%.1f\%% & %.1f\%% & %.1f\%%" % (100*precision, 100*recall, 100*fscore))
    print("confusion matrix:")
    print(conf)

    if verbose:
        FP_indeces = np.where(np.subtract(Y_pred, Y_test) == 1)[0]
        FN_indeces = np.where(np.subtract(Y_pred, Y_test) == -1)[0]
        for FP_index in FP_indeces:
            print("False positive: {}".format(' '.join(testTweets[FP_index][0])))
        for FN_index in FN_indeces:
            print("False negative: {}".format(' '.join(testTweets[FN_index][0])))
def main():
    # Read the data from the text files
    begin = time.time()
    vocab, train_raw, test_raw = read.read_tweets("../training_set_tweets.txt", "../test_set_tweets.txt")
    print "Num of Train users:", len(train_raw), "Num of Test users:", len(test_raw)
    print "Read data:", time.time() - begin

    # Preprocess the data
    begin = time.time()
    vocab, bigrams, train_word, test_word, train_char, test_char = preprocessing.preprocess(train_raw, test_raw)
    print "Preprocessed the data", time.time() - begin
    return

    # Assign ids to words
    vocab_list = list(vocab)
    vocab_list.sort()
    begin = time.time()
    vocab_dict = {}
    for i in range(len(vocab_list)):
        vocab_dict[vocab_list[i]] = i
    print "Assigned ids to words:", time.time() - begin

    # Build train and test set
    num_full_feats = len(vocab_list) + 10
    num_train_tweets = 0
    num_test_tweets = 0
    # num_train_tweets = np.count_nonzero(~np.isnan(train))
    # num_test_tweets = np.count_nonzero(~np.isnan(test))
    for author_id in train:
        num_train_tweets += len(train[author_id])
    for author_id in test:
        num_test_tweets += len(test[author_id])
    X_train = np.zeros((num_train_tweets, num_full_feats))
    y_train = np.zeros(num_train_tweets)
    X_test = np.zeros((num_test_tweets, num_full_feats))
    y_test = np.zeros(num_test_tweets)

    # Build train and test set
    num_full_feats = len(vocab_list) + 10
    num_train_tweets = 0
    num_test_tweets = 0
    # num_train_tweets = np.count_nonzero(~np.isnan(train))
    # num_test_tweets = np.count_nonzero(~np.isnan(test))
    for author_id in train_word:
        num_train_tweets += len(train_word[author_id])
    for author_id in test_word:
        num_test_tweets += len(test_word[author_id])
    X_train = np.zeros((num_train_tweets, num_full_feats))
    y_train = np.zeros(num_train_tweets)
    X_test = np.zeros((num_test_tweets, num_full_feats))
    y_test = np.zeros(num_test_tweets)

    count = 0
    for author_id in train_word:
        for tweet in train_word[author_id]:
            X_train[count, :] = features.get_full_feats(tweet, vocab_dict)
            y_train[count] = author_id
            count += 1
    print count
    count = 0
    for author_id in test_word:
        for tweet in test_word[author_id]:
            X_test[count, :] = features.get_full_feats(tweet, vocab_dict)
            y_test[count] = author_id
            count += 1
    print count

    begin = time.time()
    feats = feature_selection.select_features(X_train, y_train, np.zeros(num_full_feats), 100, "dia")
    X_train = X_train[:, feats]
    X_test = X_test[:, feats]
    print "Features selected:", time.time() - begin

    begin = time.time()
    clf = model.train(X_train, y_train)
    acc, my_acc, preds, scores = model.test(clf, X_test, y_test)
    print 'time:', time.time() - begin, 'acc:', acc, 'my_acc:', my_acc
    print 'preds:', preds
    print 'scores:', scores
    print (preds == y_test)[:100]
    print np.count_nonzero(scores > 0)
    print np.count_nonzero(scores < 0)
_sections = list(_sections.keys())
print("Grouped sections:", _sections)
for sample in news_samples:
    sample.section = _section_group_map[sample.section]

train_samples = news_samples[0:int(n_samples*_train_ratio)]
test_samples = news_samples[int(n_samples*_train_ratio):n_samples]

print("Samples distribution:", preprocessing.samples_statistics(news_samples, _sections, get_section))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _sections, get_section))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _sections, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, selection="tfidf", select_top=_textbook_words,
    savedir=_save_dir, words_src="textbook", normalize_flag=False,
    reduction=_reduction, reduce_n_attr=_reduce_n_attr, stem_words=_stem_words)

for section in _sections:
    train_labels = preprocessing.samples_to_binary(train_samples, [section], get_section)
    test_labels = preprocessing.samples_to_binary(test_samples, [section], get_section)
    model = SVR()
    print("Training for %s section.. " % section)
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)
    accuracy = 0
    for i in range(predict.shape[0]):
        if predict[i] >= 0.5:
            predict[i] = 1
        else:
_model = "./output/5"
_name_filter = ["KK201617T1", "KK201617T2"]
_words = []
_norm_dict = None
pca_components = None
model = None

with open(_model + "/preprocess.json", "r") as f:
    preprocess_dict = json.load(f)
    _words = preprocess_dict["words"]
    if "norm_info" in preprocess_dict:
        _norm_dict = preprocess_dict["norm_info"]
    if preprocess_dict["pca"]:
        pca_components = np.load(_model + '/pca.npy')

def get_label(sample):
    # return sample.think + sample.understand + sample.lang + sample.pres
    return sample.think + sample.understand

samples = preprocessing.tp_sample.get_samples(_sample_folder)
texts = [sample.comment for sample in samples if sample.batch_name in _name_filter]
test_matrix, _, _ = preprocessing.preprocess(texts, words_src=_words)
if pca_components is not None:
    test_matrix = np.matmul(test_matrix, pca_components.T)
if _norm_dict is not None:
    test_matrix, _, _ = preprocessing.normalize(test_matrix, norm_info=_norm_dict)
model = models.SVR.load(_model)
result = model.predict(test_matrix)
print([get_label(sample) for sample in samples])
print(result)
def runSyllableClassification(self, SyllPath=None, nTrain=50, nTest=20, cType=2,
                              useStoredPatts=True, useRawOutput=True, pattTimesteps=None,
                              maxPauseLength=3, dataPrepParams={}, cLearningParams={}):
    """
    :Description: Function that learns conceptors for each syllable in self.Songs and tries to
                  classify the sequence of syllables generated from repeating the songs several times.

    :Parameters:
        1. SyllPath:        if useSyllRecog = True, this needs to be the full directory to the folder
                            including the syllable data (default = None)
        2. nTrain:          number of training samples to be used for each syllable (default = 50)
        3. nTest:           number of test samples to be used for each syllable (default = 20)
        4. cType:           index that indicates which conceptor's recognition results to use
                            {0 = pos, 1 = neg, 2 = combined} (default = 2)
        5. useStoredPatts:  if True, run syllable classification on self.patterns, else create a new
                            sequence according to the repetition times in pattTimesteps (default = True)
        6. useRawOutput:    if True, store evidences from the chosen conceptor as patterns; if False,
                            apply winner-takes-all classification on the evidences (default = True)
        7. pattTimesteps:   list of scalars representing the length each song in self.Songs should be
                            presented at test time (only necessary if useStoredPatts is False)
        8. maxPauseLength:  maximal length of pauses to be added randomly after each song (default = 3)
        9. dataPrepParams:  dictionary of keyword arguments for data preprocessing if syllable
                            recognition is to be used (defaults of the preprocessing function are used
                            if not specified)
        10. cLearningParams: dictionary of keyword arguments to learn a conceptor for each syllable if
                            syllable recognition is to be used (defaults of the syllable classifier are
                            used if not specified)

    :Returns:
        1. newPatts: list of patterns with recognition evidences for each syllable played
    """

    if self.verbose:
        print('Running syllable recognition...')

    path = os.path.dirname(os.path.abspath(__file__)) if SyllPath is None else os.path.abspath(SyllPath)
    self.path = path

    # generate sequence of syllables from patterns to use syllableClassifier on
    syllClassPatts = np.zeros((1, self.nSylls))

    # either use stored patterns
    if useStoredPatts:
        pattTimesteps = []
        for p in self.patterns:
            syllClassPatts = np.append(syllClassPatts, p, axis=0)
            pattTimesteps.append(len(p))
    # or create sequences of lengths according to entries in pattTimesteps
    else:
        for i, t in enumerate(pattTimesteps):
            patt = self.patterns[i][0:len(self.Songs[i]), :]
            syllClassPatts = np.append(syllClassPatts,
                                       np.tile(patt, [round(t/len(self.Songs[i])), 1]),
                                       axis=0)
    syllClassPatts = syllClassPatts[1:, :]

    # if conceptors for syllables have not been learned already, learn them
    if not self.syllableConceptorsLearned:
        # get list with unique syllables and create preprocessed training and test data
        songs = []
        for s in self.Songs:
            songs += s
        songs = set(songs)
        self.SyllClassData = preprocess(path, self.nSylls, nTrain, np.ones(self.nSylls) * nTest,
                                        syll_names=self.Sylls, **dataPrepParams)
        # initialize syllableClassifier and train it on training data
        self.SyllClass = syllableClassifier(
            cLearningParams['neurons'],
            cLearningParams['spectral_radius'],
            cLearningParams['bias_scale'],
            cLearningParams['inp_scale'],
            cLearningParams['conn'])
        self.SyllClass.cLearning(nTrain, self.SyllClassData['train_data'],
                                 cLearningParams['gammaPos'], cLearningParams['gammaNeg'])
        self.syllableConceptorsLearned = True

    # run classification on syllClassPatts and store the evidences for each presented syllable
    sampleIdx = [0, round(nTest/2)]
    results = self.SyllClass.cTest(self.SyllClassData['test_data'],
                                   pattern=syllClassPatts, sampleIdx=sampleIdx)
    evidences = results['evidences'][cType]
    if not useRawOutput:
        evidences_tmp = np.zeros_like(evidences)
        for syll in range(evidences.shape[0]):
            evidences_tmp[syll, np.argmax(evidences[syll, :])] = 1
        evidences = evidences_tmp
    sampleIdx = [round(nTest/2), nTest - 1]

    # create list with entries for each pattern and store the respective evidences in those entries
    t_all = 0
    newPatts = []
    for i, t in enumerate(pattTimesteps):
        patt = np.zeros((1, self.nSylls))
        for j in range(round(t/len(self.Songs[i]))):
            pause_length = np.random.randint(maxPauseLength)
            patt_tmp = np.concatenate(
                (evidences[t_all + j*len(self.Songs[i]):t_all + (j+1)*len(self.Songs[i]), :],
                 np.zeros((pause_length, self.nSylls))), axis=0)
            patt = np.vstack((patt, patt_tmp))
            pattTimesteps[i] += pause_length
        patt = patt[1:, :]
        newPatts.append(patt)
        t_all += (j+1)*len(self.Songs[i])
    return newPatts
def main():
    parser = argparse.ArgumentParser(description="Run QA-CLEF-System")
    parser.add_argument('--preprocess', action="store_true")
    parser.add_argument('--train', action="store_true")
    parser.add_argument('--answeronly', action='store_true')
    parser.add_argument('--selftest', action='store_true')
    parser.add_argument('--data', nargs='+', default=[2011], type=int)
    parser.add_argument('--test', nargs='+', default=[2012], type=int)
    parser.add_argument('--forcedownload', action='store_true')
    parser.add_argument('--preprocessonly', action='store_true')
    parser.add_argument('--ngram', type=int, default=3)
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--report', action='store_true')

    args = parser.parse_args()
    process_args(args)

    data = []
    for edition in args.data + args.test:
        _data = qacache.find_data(edition)
        if args.preprocess or _data is None:
            input_check([edition], args.forcedownload)
            _data = input_parse([edition])
            print >> sys.stderr, 'preprocessing ' + str(edition) + '-data'
            _data = preprocessing.preprocess(_data)
            qacache.store_preprocessed_data(edition, _data[0])
        else:
            print >> sys.stderr, str(edition) + '-data is found on cache/' + str(edition) + '-preprocessed.txt'
        data.append(_data)

    if args.preprocessonly:
        print >> sys.stderr, 'Preprocess-only task is done.'
        sys.exit(0)

    # build-model
    print >> sys.stderr, 'Building model...'
    training_model = model_builder.build_model(data[:len(args.data)])
    test_model = model_builder.build_model(data[-len(args.test):]) \
        if len(args.test) != 0 and not args.selftest else []

    # scoring
    print >> sys.stderr, 'Unweighted Feature Scoring...'
    training_model and scoring.score(training_model)
    test_model and scoring.score(test_model)

    # training
    weight = qacache.stored_weight()
    if args.train or weight is None:
        print >> sys.stderr, 'Training...'
        weight = train(training_model)
    else:
        print >> sys.stderr, 'Weight is found on cache/weight.txt'

    # weighted_scoring
    print >> sys.stderr, 'Weighted Feature Scoring...'
    final = scoring.weighted_scoring(training_model if args.selftest else test_model, weight)

    # answer selection
    select_answer(final, args.threshold)

    # evaluation
    result = evaluate(final)
    qacache.write_json(final, 'final.txt', indent=True)

    if args.report:
        report(final, args.test if not args.selftest else args.data, weight)

    print "Result: %f" % result
random.shuffle(news_samples)
n_samples = len(news_samples)
train_samples = news_samples[0:int(n_samples*_train_ratio)]
test_samples = news_samples[int(n_samples*_train_ratio):n_samples]

print("Samples distribution:", preprocessing.samples_statistics(news_samples, _sections, get_section))
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _sections, get_section))
print("Test set distribution:", preprocessing.samples_statistics(test_samples, _sections, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

tfidf_vectorizer = get_tfidfVectorizer_of_essay_top_tf_words()
print("Vectorizer built..")

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, savedir=_save_dir, words_src=tfidf_vectorizer,
    normalize_flag=False, reduction=_reduction, reduce_n_attr=_reduce_n_attr,
    stem_words=_stem_words)

model = None
print("Generating labels..")
if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _sections, get_section)
    test_labels = preprocessing.samples_to_label(test_samples, _sections, get_section)
    model = SVM()
    print("Training.. ")
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)
elif _model == "NN":
    train_dists = preprocessing.samples_to_dists(train_samples, _sections, get_section)
    test_dists = preprocessing.samples_to_dists(test_samples, _sections, get_section)
    model = Neural_Network(_n_factors=train_matrix.shape[1], _learning_rate=_learning_rate,
                           _hidden_nodes=_hidden_nodes, _last_layer=len(_sections))
# Stdlib imports used below; project-specific helpers (unicode_csv_reader, clean_labels,
# k_fold_cross_validation, filter_motifs, preprocess, cleanfile, and the
# llda/classifier/MR/lldaTfidf systems) are assumed to be imported elsewhere.
import codecs
import csv
import os
import sys
from collections import defaultdict
from ConfigParser import SafeConfigParser


def main(parameters):
    config = SafeConfigParser()
    config.read(parameters)

    ROOTDIR = config.get('filepaths', 'corpus')
    if len(os.listdir(ROOTDIR)) < 2:
        documents = []
        with codecs.open(config.get('filepaths', 'basefile'), encoding='utf-8') as f:
            for (source, labels, text) in unicode_csv_reader(f):
                labels = clean_labels(labels)
                documents.append((source, labels, text))
        for fold in k_fold_cross_validation(documents, 10):
            print fold
            fold, training, validation = filter_motifs(fold)
            with open(ROOTDIR + 'fold-%s.training.txt' % fold, 'w') as out:
                writer = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
                for (source, motifs, text) in training:
                    # motifs = [motif for motif in motifs if motif != 'DUMMY']
                    writer.writerow([source, ' '.join(motifs).encode('utf-8'), text.encode('utf-8')])
            with open(ROOTDIR + 'fold-%s.validation.txt' % fold, 'w') as out:
                writer = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
                for (source, motifs, text) in validation:
                    writer.writerow([source, ' '.join(motifs).encode('utf-8'), text.encode('utf-8')])

    training_preprocessor = lambda t: preprocess(
        t,
        encoding=config.get('preprocessing', 'encoding'),
        strip_accents='unicode' if config.getboolean('preprocessing', 'strip-accents') else None,
        strip_punct=config.getboolean('preprocessing', 'strip-punctuation'),
        lowercase=config.getboolean('preprocessing', 'lowercase'),
        max_df=config.getfloat('preprocessing', 'maximum-document-frequency'),
        min_df=config.getint('preprocessing', 'minimum-document-frequency'),
        min_word_len=config.getint('preprocessing', 'minimum-word-length'),
        join=False)

    validation_preprocessor = lambda t: preprocess(
        t,
        encoding=config.get('preprocessing', 'encoding'),
        strip_accents='unicode' if config.getboolean('preprocessing', 'strip-accents') else None,
        strip_punct=config.getboolean('preprocessing', 'strip-punctuation'),
        lowercase=config.getboolean('preprocessing', 'lowercase'),
        max_df=1.0,
        min_df=1.0,
        min_word_len=config.getint('preprocessing', 'minimum-word-length'),
        join=False)

    documents = defaultdict(list)
    for document in os.listdir(ROOTDIR):
        if not document.startswith('.') and document.startswith('fold'):
            documents[document.split('.')[0]].append(document)

    globalAP = []
    globalMargin = []
    globalOneError = []
    globalIsError = []

    system = config.get('system', 'system')
    if system == 'llda':
        system = llda
    elif system.upper() in ('SGD', 'SVC', 'KNN', 'NB'):
        system = classifier
    elif system == 'BM25':
        system = MR
    elif system == 'lldaTfidf':
        system = lldaTfidf
    else:
        raise ValueError("Unsupported system choice: %s" % system)

    for k, (fold, (training_docs, test_docs)) in enumerate(documents.iteritems()):
        assert 'training' in training_docs and 'validation' in test_docs
        training = list(cleanfile(ROOTDIR + training_docs, training_preprocessor, label_df=1))
        validation = list(cleanfile(ROOTDIR + test_docs, validation_preprocessor, label_df=1))

        isError, oneError, nDocs, margins, AP = system.run(training, validation, k, config)
        isError = isError / float(nDocs)
        oneError = oneError / float(nDocs)
        margins = sum(margins) / float(nDocs)
        AP = sum(AP) / float(nDocs)
        globalIsError.append(isError)
        globalOneError.append(oneError)
        globalMargin.append(margins)
        globalAP.append(AP)

        print 'Fold:', k
        print '-' * 80
        print 'Num training docs:', len(training)
        print 'Num validation docs:', len(validation)
        print 'Average Precision:', AP
        print 'Is Error:', isError
        print 'One Error:', oneError
        print 'Margin:', margins
        print '-' * 80

    print 'AVERAGE AP:', sum(globalAP) / len(globalAP)
    print 'AVERAGE ONE ERROR:', sum(globalOneError) / len(globalOneError)
    print 'AVERAGE IS ERROR:', sum(globalIsError) / len(globalIsError)
    print 'AVERAGE MARGIN:', sum(globalMargin) / len(globalMargin)

    output_dir = os.path.join('Data', sys.argv[-1])
    with open(os.path.join(output_dir, 'output.txt'), 'w') as out:
        out.write('Average Precision: %f\n' % (sum(globalAP) / len(globalAP)))
        out.write('Average One Error: %f\n' % (sum(globalOneError) / len(globalOneError)))
        out.write('Average Is Error: %f\n' % (sum(globalIsError) / len(globalIsError)))
        out.write('Average Margin: %f\n' % (sum(globalMargin) / len(globalMargin)))
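# A sketch of the kind of INI file main() expects (section and option names are taken
# from the config.get*/getboolean/getfloat calls above; the values are assumptions):
#
#   [filepaths]
#   corpus = Data/corpus/
#   basefile = Data/annotated_documents.csv
#
#   [preprocessing]
#   encoding = utf-8
#   strip-accents = true
#   strip-punctuation = true
#   lowercase = true
#   maximum-document-frequency = 0.8
#   minimum-document-frequency = 2
#   minimum-word-length = 3
#
#   [system]
#   system = llda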
    for item in tupleList:
        for word in item[0]:
            res.append(word)
    return res


'''
I used code from http://www.nltk.org/book/ch06.html for this
'''

if __name__ == "__main__":
    print("NB start")

    racistTweets = [(preprocess(d), c) for (d, c) in loadRacistTweets(excludeJokes=True)]
    normalTweets = [(preprocess(d), c) for (d, c) in loadNonRacistTweets(numTweets=len(racistTweets))]

    print("Number of racist tweets: {}.".format(len(racistTweets)))
    print("Number of normal tweets: {}.".format(len(normalTweets)))

    numTrain = 1500
    numTest = 500

    trainR = racistTweets[0:numTrain]
    testR = racistTweets[numTrain:numTrain + numTest]
    trainN = normalTweets[0:numTrain]
    testN = normalTweets[numTrain:numTrain + numTest]
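    # Hypothetical continuation (not from the original): train an NLTK Naive Bayes
    # classifier in the style of the NLTK book (ch. 6) referenced above. It assumes
    # preprocess() returns a list of tokens for each tweet.
    import nltk

    def bagOfWords(tokens):
        # simple bag-of-words feature dictionary
        return {word: True for word in tokens}

    trainSet = [(bagOfWords(d), c) for (d, c) in trainR + trainN]
    testSet = [(bagOfWords(d), c) for (d, c) in testR + testN]

    classifier = nltk.NaiveBayesClassifier.train(trainSet)
    print("Accuracy: {:.3f}".format(nltk.classify.accuracy(classifier, testSet)))
    classifier.show_most_informative_features(10)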
# Assumes numpy (np), os, and random are imported at module level, along with the
# project's `preprocessing` module and the `DeepNN` class.
def runDNN(path, syllN, trainN, cvalRuns, sampRate, interpolType, mfccN, invCoeffOrder, winsize,
           melFramesN, smoothL, polyOrder, incDer, snr=0.0, syllNames=None, layerSizes=[60, 10],
           activationFcts='tanh', dropouts=[], normalizations=[], optimizer='Adam',
           learningRate=0.0005, batchSize=10, nEpochs=10, loss='CrossEntropyExclusiveSparse',
           validate_per_step=100, samplingSDs=0.05):
    """ Runs syllable classification in a supervised manner with a deep neural network (DeepNN) classifier, cross-validated over cvalRuns runs. """

    path = os.path.abspath(path)

    """ assign parameters """

    prepParams = {
        'syll_names': syllNames,
        'sample_rate': sampRate,
        'ds_type': interpolType,
        'mel_channels': mfccN,
        'inv_coefforder': invCoeffOrder,
        'winsize': winsize,
        'frames': melFramesN,
        'smooth_length': smoothL,
        'inc_der': incDer,
        'poly_order': polyOrder,
        'snr': snr
    }

    performances = []
    evidences = []

    for i in range(cvalRuns):

        n_test = np.ones(syllN, dtype=int) * 20

        Samples = []
        if cvalRuns > 1:
            for j in range(syllN):
                indices = np.arange(0, trainN + n_test[j], 1)
                ind_tmp = indices.copy().tolist()
                random.shuffle(ind_tmp)
                ind_tmp = np.array(ind_tmp)
                Samples.append(ind_tmp)

        """ Get and preprocess data """

        data = preprocessing.preprocess(path, syllN, trainN, n_test, samples=Samples, **prepParams)

        trainData = data['train_data']
        testData = data['test_data']

        inpDim = mfccN * (1 + sum(incDer)) * smoothL
        testL = int(n_test[0] / 2)

        data_train = np.zeros((len(trainData) * trainN, inpDim))
        labels_train = np.zeros(len(trainData) * trainN)
        data_test = np.zeros((len(testData) * testL, inpDim))
        labels_test = np.zeros(len(testData) * testL)
        data_validate = np.zeros_like(data_test)
        labels_validate = np.zeros_like(labels_test)

        for t, syll in enumerate(trainData):
            for s, sample in enumerate(syll):
                data_train[t * trainN + s, :] = sample.flatten()
                labels_train[t * trainN + s] = t

        # first half of each test syllable goes to the test set, second half to validation
        for t, syll in enumerate(testData):
            for s, sample in enumerate(syll):
                if s < n_test[0] / 2:
                    data_test[t * testL + s, :] = sample.flatten()
                    labels_test[t * testL + s] = t
                else:
                    data_validate[t * testL + s - testL, :] = sample.flatten()
                    labels_validate[t * testL + s - testL] = t

        """ create DNN """

        syllClassifier = DeepNN(inpDim, 1)

        if not any(dropouts):
            dropouts = np.zeros(len(layerSizes))
        if not any(normalizations):
            normalizations = np.zeros(len(layerSizes))
        if not type(samplingSDs) == np.ndarray:
            samplingSDs = np.zeros(len(layerSizes)) + samplingSDs

        for n, l in enumerate(layerSizes):
            # activationFcts may be a single function name for all layers or one per layer;
            # only the first layer gets a bias term (behavior unchanged from the original branches)
            fct = activationFcts[n] if type(activationFcts) is list else activationFcts
            syllClassifier.addLayer(l, fct, include_bias=(n == 0), normalization=normalizations[n],
                                    dropout=dropouts[n], sd=samplingSDs[n])

        """ train DNN and classify test data """

        data = [data_train, data_validate]
        labels = [labels_train, labels_validate]

        syllClassifier.train(data, labels, loss_type=loss, optimizer_type=optimizer,
                             learning_rate=learningRate, n_epochs=nEpochs, batch_size=batchSize,
                             validate_per_step=validate_per_step, verbose=20)
        syllClassifier.test(data_test, labels_test, normalize=True)

        results = syllClassifier.test_predictions
        performance = np.mean(np.argmax(results, axis=1) == labels_test)

        evidences.append(results)
        performances.append(performance)

    cval_results = {'Evidences': evidences, 'Labels': labels_test, 'Performances': performances}

    return cval_results
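# Hypothetical usage sketch (all argument values are assumptions, not from the original):
#
#   results = runDNN(path='data/syllables', syllN=10, trainN=30, cvalRuns=5,
#                    sampRate=20000, interpolType='linear', mfccN=12, invCoeffOrder=True,
#                    winsize=20, melFramesN=64, smoothL=4, polyOrder=3,
#                    incDer=[True, True], layerSizes=[60, 10], nEpochs=10)
#   print(np.mean(results['Performances']))   # mean accuracy over the cross-validation runs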
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 18 11:13:56 2016

@author: charles
"""
# __future__ imports must precede any other code, so this is moved to the top of the
# file instead of the cell where Keras is imported.
from __future__ import print_function

import pandas as pd
import numpy as np
import preprocessing as pp

#%%
# this takes roughly 1 minute
X_train2D, Y_train2D, Date_train2D, Assignment_train2D = pp.preprocess()

#%%
X_train, Y_train, Category_train, nInputNumber, nSequenceLength, nInputDim, scalerY = pp.preprocessDeep()

#%%
from keras.preprocessing import sequence
from keras.models import Graph
from keras.layers.core import TimeDistributedDense, Dropout, Masking
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM


def buildLSTM(nSequenceLength, nInputDim, nLSTMoutputDim=100):