def cosine_similarity(books_data, DF, tf_idf, total_vocab, total_vocab_size, k, query):
    """Return the top-k chapters most similar to the query, keyed by document index."""
    final_dict = dict()

    # Build the document-term matrix from the TF-IDF scores.
    D = zero_vector(tf_idf, total_vocab, books_data.shape[0], total_vocab_size)

    # Preprocess and tokenize the query the same way the corpus was processed.
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    print("")

    # Score every document vector against the query vector.
    d_cosines = []
    query_vector = gen_vector(DF, tokens, books_data, total_vocab)
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))

    # Indices of the k highest-scoring documents, best first.
    out = np.array(d_cosines).argsort()[-k:][::-1]

    for each in out:
        case = {
            each: {
                'bookname': books_data['bookname'][each],
                'author': books_data['author'][each],
                'chapter': books_data['chapter'][each]
            }
        }
        final_dict.update(case)

    return final_dict
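# The helper cosine_sim used above is not shown in this snippet. A minimal
# sketch of what it is assumed to compute: standard cosine similarity between
# two dense vectors. If either norm is zero the division yields NaN, which the
# second cosine_similarity variant below explicitly maps to 0.0; the project's
# own helper may guard against that differently.
import numpy as np

def cosine_sim(a, b):
    # cos(a, b) = a . b / (|a| * |b|)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))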
def load():
    print("Reading training data...")
    df = pd.read_csv(DATASET_PATH + TRAIN_FILE_NAME)

    print("Preprocessing training data...")
    question1, question2, labels, tokenizer = pp.preprocess(df, mode='train')

    return question1, question2, labels, tokenizer
def main():
    if len(sys.argv) == 3:
        start = time.time()

        print('Preprocessing the training data..')
        train = load_data(sys.argv[1])
        print(train.head(3))
        train = preprocess_data.preprocess(train, train=True)
        train = balance_training_data.balance_data(train)

        print('Preprocessing the test data..')
        test = load_data(sys.argv[2])
        print(test.head(3))
        test = preprocess_data.preprocess(test, train=False)

        print('Building the model..')
        classification.two_step_classification(train, test)

        end = time.time()
        print("The whole thing took {0:.2f} minutes".format((end - start) / 60))
    else:
        print("You need to provide two files: one set of training data and one set of test data.")
def cosine_similarity(books_data, DF, tf_idf, total_vocab, total_vocab_size, k, query):
    """Return the top-k chapters most similar to the query, including their cosine scores."""
    final_dict = dict()

    D = zero_vector(tf_idf, total_vocab, books_data.shape[0], total_vocab_size)

    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    print("")

    d_cosines = []
    query_vector = gen_vector(DF, tokens, books_data, total_vocab)
    for d in D:
        cosine_rate = cosine_sim(query_vector, d)
        # A zero document or query vector yields NaN; treat it as no similarity.
        if math.isnan(cosine_rate):
            d_cosines.append(0.0)
        else:
            d_cosines.append(cosine_rate)

    # Indices of the k highest-scoring documents and their scores, best first.
    out = np.array(d_cosines).argsort()[-k:][::-1]
    cosine_val = sorted(d_cosines, reverse=True)[:k]

    for each in out:
        result = np.where(out == each)
        # Skip documents that share no terms with the query.
        if cosine_val[result[0][0]] == 0.0:
            pass
        else:
            case = {
                each: {
                    'bookname': books_data['bookname'][each],
                    'author': books_data['author'][each],
                    'chapter': books_data['chapter'][each],
                    'similarity': cosine_val[result[0][0]]
                }
            }
            final_dict.update(case)

    return final_dict
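# gen_vector is likewise not defined in these snippets. The sketch below is an
# assumption: a conventional TF-IDF query vectorizer consistent with the names
# used above (DF is assumed to map a term to its document-frequency count,
# total_vocab is assumed to be the ordered vocabulary list, and
# books_data.shape[0] is the number of documents). The project's actual
# weighting scheme may differ.
from collections import Counter
import numpy as np

def gen_vector(DF, tokens, books_data, total_vocab):
    N = books_data.shape[0]                 # number of documents in the corpus
    Q = np.zeros(len(total_vocab))          # query vector over the vocabulary
    counts = Counter(tokens)
    for token in np.unique(tokens):
        tf = counts[token] / len(tokens)    # term frequency within the query
        df = DF.get(token, 0)               # document frequency in the corpus
        idf = np.log((N + 1) / (df + 1))    # smoothed inverse document frequency
        try:
            Q[total_vocab.index(token)] = tf * idf
        except ValueError:
            pass                            # token not in the corpus vocabulary
    return Q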
parser = argparse.ArgumentParser()
parser.add_argument('-embedding', action='store', dest='embedding')
fasttext_name = parser.parse_args().embedding

# Hyperparameters
embedding_dim = 100
learning_rate = 0.001145
bs = 256
drop = 0.2584
max_length = 1431
max_num_words = 23140
filters = [6]
num_filters = 2426
nclasses = 451

x_train, y_train, x_val, y_val, embedding_matrix = preprocess(
    fasttext_name, embedding_dim, max_length, max_num_words)

print("Starting Training ...")

filter_sizes = []
for i in filters:
    filter_sizes.append(i)

# Trainable embedding layer initialised with the pretrained fastText matrix.
embedding_layer = Embedding(max_num_words,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=True)
sequence_input = Input(shape=(max_length,), dtype='uint16')
embedded_sequences = embedding_layer(sequence_input)
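# The snippet above stops right after the embedding lookup. A hedged sketch of
# how the declared hyperparameters (filter_sizes, num_filters, drop, nclasses,
# learning_rate, bs) are typically wired into a 1-D convolutional text
# classifier; the import path, the epoch count, and the loss (which assumes
# one-hot labels) are assumptions, not the project's confirmed architecture.
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

pooled = []
for fs in filter_sizes:
    conv = Conv1D(num_filters, fs, activation='relu')(embedded_sequences)
    pooled.append(GlobalMaxPooling1D()(conv))
merged = pooled[0] if len(pooled) == 1 else Concatenate()(pooled)
merged = Dropout(drop)(merged)
preds = Dense(nclasses, activation='softmax')(merged)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=learning_rate),
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=bs, epochs=10, validation_data=(x_val, y_val))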
batchSize = 64
nEpochs = 20

print('Training!')
roc_auc = train_neural_network(model,
                               batchSize,
                               nEpochs,
                               x_train,
                               y_train,
                               x_test=x_test,
                               y_test=y_test)
plot_gini_results(roc_auc.gini, roc_auc.gini_val)

nn_output = model.predict(x_validation).flatten()
export_csv_file("output.csv", validation_ids, nn_output)

x_train_file_name, x_validate_file_name = 'Data/train.csv', 'Data/test.csv'
x_train, y_train, x_validation, validation_ids = generate_features(
    x_train_file_name, x_validate_file_name)

x_train = np.array(x_train).astype(np.float32)
y_train = np.array(y_train).astype(np.int32)
x_validation = np.array(x_validation).astype(np.float32)
validation_ids = np.array(validation_ids).astype(np.int32)

x_train, y_train, x_validation = preprocess(x_train, y_train, x_validation)
puertoPredictions(x_train, y_train, x_validation, validation_ids)
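# plot_gini_results above consumes per-epoch Gini values. Assuming the
# project's metric is the normalized Gini coefficient commonly used for binary
# classification leaderboards, it can be derived from the ROC AUC as
# Gini = 2 * AUC - 1. A minimal sketch; the real training callback may compute
# it differently.
from sklearn.metrics import roc_auc_score

def gini_from_auc(y_true, y_score):
    # Perfect ranking -> 1.0, random ranking -> 0.0.
    return 2.0 * roc_auc_score(y_true, y_score) - 1.0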
# Minimum Manhattan LSTM distance between two outputs
# for them to be classified as semantically similar
threshold = 0.1

# Load trained model
print("Loading model...")
model = tf.keras.models.load_model(
    CHECKPOINT_PATH + MODEL_FILE_NAME,
    custom_objects={"manh_lstm_distance": manh_lstm_distance})

# Read test file
print("Reading test data...")
df = pd.read_csv(DATASET_PATH + TEST_FILE_NAME, skiprows=skiprows, nrows=nrows)

# Preprocess test data
print("Preprocessing test data...")
question1, question2 = pp.preprocess(df, mode='predict')

# Predict Manhattan LSTM distances
print("Predicting Manhattan LSTM distances...")
manh_lstm_distance = model.predict([question1, question2], verbose=1)

# Make binary predictions
print("Making binary predictions...")
prediction = manh_lstm_distance > threshold
prediction = prediction.astype(int)

# Print predictions
data = {
    'Manhattan LSTM distances': list(manh_lstm_distance),
    'Prediction': list(prediction)
}
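# manh_lstm_distance is passed to load_model above as a custom object but is
# not defined in this snippet. A hedged sketch of the usual MaLSTM similarity
# (Mueller & Thyagarajan, 2016), exp(-||h_left - h_right||_1), which maps two
# sentence encodings to a score in (0, 1]; the project's implementation may
# differ in shape handling.
import tensorflow.keras.backend as K

def manh_lstm_distance(vectors):
    left, right = vectors
    return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))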
parser.add_argument('--reproduce',
                    type=bool,
                    default=False,
                    help='Set to True if trying to reproduce results')
parser.add_argument('--plots',
                    type=bool,
                    default=False,
                    help='Set to True if wanting to generate the plots')
parser.add_argument('--manual_eval',
                    type=bool,
                    default=True,
                    help='Set to True to perform manual evaluation')
args = parser.parse_args()

df = pd.read_csv(args.input)
data = df[(df.week == args.week)]
data = preprocess(data)
data.stems = [' '.join(text) for text in data.stems]
print('This week has {0} articles.'.format(len(data)))

vec_matrix_pca = tfidf_creation(data)
k = elbow_plot(vec_matrix_pca)
centroids, labels = clustering_kmeans(k, vec_matrix_pca, args.week, args.reproduce)
data['labels'] = labels

if args.plots:
    clusters_plot(centroids, labels, vec_matrix_pca, args.week)
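# clustering_kmeans and tfidf_creation above are defined elsewhere. A hedged
# sketch of what clustering_kmeans is assumed to do: fit scikit-learn's KMeans
# on the reduced TF-IDF matrix, fixing the random seed when --reproduce is set
# so runs are repeatable. The seed value is an assumption, and the week
# argument is presumably used for bookkeeping in the real project.
from sklearn.cluster import KMeans

def clustering_kmeans(k, vec_matrix, week, reproduce):
    kmeans = KMeans(n_clusters=k, random_state=42 if reproduce else None)
    kmeans.fit(vec_matrix)
    return kmeans.cluster_centers_, kmeans.labels_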
args = parser.parse_args()
config = config_file.config_preprocess[args.configurationID]

# 1. Make new directory for the mel spectrograms
config["melspectrogram_path"] = config['identifier'] + \
    "/%s_mels/" % (config['identifier'])  # set audio representations folder
if not os.path.exists(config_file.DATA_PATH + config['melspectrogram_path']):
    os.makedirs(config_file.DATA_PATH + config['melspectrogram_path'])

# 2. Find audio files to preprocess
files_to_preprocess = []
with open(config_file.DATA_PATH + config["index_file"]) as f:
    for line in f.readlines():
        file_id, audio = line.strip().split("\t")
        melspectrogram = audio[:audio.rfind(".")] + ".pk"  # .npy or .pk
        # (id, path to audio file, path to mel spectrogram)
        files_to_preprocess.append(
            (file_id, config["audio_path"] + audio,
             config_file.DATA_PATH + config["melspectrogram_path"] + melspectrogram))

# 3. Compute mel spectrograms
preprocess(files_to_preprocess, config)

# 4. Save the parameters in a json
json.dump(
    config,
    open(
        config_file.DATA_PATH + config['melspectrogram_path'] + "config.json",
        "w"))
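# preprocess(files_to_preprocess, config) is defined elsewhere. A hedged sketch
# of the kind of work it is expected to do for each (id, audio_path, spec_path)
# tuple: load the audio, compute a mel spectrogram, and pickle it to the .pk
# path built above. The config keys 'resample_sr' and 'n_mels' and their
# defaults are assumptions, not the project's actual parameters.
import pickle
import librosa

def compute_melspectrogram(audio_path, spec_path, config):
    y, sr = librosa.load(audio_path, sr=config.get('resample_sr', 16000))
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=config.get('n_mels', 96))
    with open(spec_path, 'wb') as spec_file:
        pickle.dump(mel, spec_file)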