def test_preprocess_read():
    """Test preprocess.py read_file function"""
    assert read_file('students.csv')[0]['GPA'][0] == 4.7
    assert read_file('students.xlsx')[0]['GPA'][0] == 4.3
    assert read_file('students.csv')[1] == 'students'
    assert read_file('new_data/students.csv')[1] == 'students'
    assert read_file('students.xlsx')[1] == 'students'
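# The assertions above imply that read_file returns a (dataframe, table_name) pair,
# where table_name is the file stem. A minimal sketch of such a reader, assuming
# pandas and that only .csv / .xlsx inputs are expected (hypothetical; not the
# project's actual implementation):
import os
import pandas as pd

def read_file_sketch(path):
    """Return (dataframe, table_name) for a .csv or .xlsx file."""
    name = os.path.splitext(os.path.basename(path))[0]
    if path.endswith('.xlsx'):
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path)
    return df, name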
def predict(input_path, output_path, resources_path):
    """
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the BIES format.

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resources_path instead, otherwise we will not be able to run the code.

    :param input_path: the path of the input file to predict.
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    # load the training file with the sentences of all four sub-datasets to build the vocabularies
    concat = pre.read_file(os.path.join(resources_path, "tensor_concat_train.utf8"))

    # create the vocabularies from the concatenation of all training files
    vocab_unigrams = pre.make_vocab(concat, 1)
    vocab_bigrams = pre.make_vocab(concat, 2)

    # load the test file from input
    test_tensor_lines = pre.read_file(input_path)

    # convert the input sentences into indices
    test_x_uni = pre.word_to_index(test_tensor_lines, vocab_unigrams, 1)
    test_x_bi = pre.word_to_index(test_tensor_lines, vocab_bigrams, 2)

    # DEFINE SOME CONSTANTS
    VOCAB_SIZE_UNI = len(vocab_unigrams)
    VOCAB_SIZE_BI = len(vocab_bigrams)
    CHAR_EMBEDDING_SIZE = 32
    BIGRAM_EMBEDDING_SIZE = [16, 32, 64]
    LEARNING_RATE = [0.04, 0.035, 0.03, 0.02, 0.009]
    HIDDEN_SIZE = 256
    INPUT_DROPOUT = [0, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6]
    LSTM_DROPOUT = [0, 0.1, 0.2, 0.3, 0.4]

    # BUILD THE MODEL
    model = md.create_keras_model_parallel(VOCAB_SIZE_UNI, VOCAB_SIZE_BI,
                                           CHAR_EMBEDDING_SIZE,
                                           BIGRAM_EMBEDDING_SIZE[1],
                                           HIDDEN_SIZE, INPUT_DROPOUT[2],
                                           LSTM_DROPOUT[2])

    print("Load the weights...")
    # load the best weights
    model.load_weights(os.path.join(resources_path, "weights.hdf5"))

    print("Predict...")
    # compute the predictions
    result = result_from_prediction(model, test_x_uni, test_x_bi)

    print("Save the result on ", output_path)
    # write the predictions to the output file
    pre.create_file(output_path, result)
    print("Done!")
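# result_from_prediction is referenced above but not shown. A minimal sketch of what
# it might do, assuming the model emits one softmax distribution over the four BIES
# classes per character and that sentences are padded to a fixed length (both are
# assumptions; the real helper may handle padding and class order differently):
import numpy as np

BIES = ['B', 'I', 'E', 'S']  # hypothetical class order

def result_from_prediction_sketch(model, test_x_uni, test_x_bi):
    probs = model.predict([test_x_uni, test_x_bi])   # shape: (sentences, chars, 4)
    labels = np.argmax(probs, axis=-1)               # most likely class per character
    return [''.join(BIES[c] for c in row) for row in labels]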
def main():
    if len(sys.argv) != 3:
        print("Requires two arguments: a configuration file and a file containing JSON with the necessary information to add a peer.")
        exit(1)
    conf = read_file(sys.argv[1])
    info = read_file(sys.argv[2])
    conf["authorizedPasswords"].append({"password": info["password"]})
    conf["interfaces"]['UDPInterface'][0]['connectTo'][info["ip"] + ":" + info["port"]] = {
        "password": info["password"],
        "publicKey": info["publicKey"],
    }
    with open(sys.argv[1], "w") as out:
        out.write(json.dumps(conf, sort_keys=True, indent=4))
def test_loader(index):
    with open('name2idx.json', 'r') as fp:
        name2idx = json.load(fp)

    root = ClassNode('root')
    body = read_file('dataset/test/%d.txt' % index).body
    if not body:
        return None  # raise ValueError('Invalid html file.')

    find_child(body, root)
    path_list = get_all_paths(root)

    output = []
    for j in range(len(path_list)):
        # print('Processing %d-%d...' % (index, j))
        is_looping = True
        for k in itertools.product(*path_list[j][1:]):
            idx_list = []
            for i in k:
                try:
                    idx_list.append(name2idx[i])
                except KeyError:
                    print('Class names out of scope: ', i)
                    is_looping = False
                    break
            if is_looping:
                output.append(torch.tensor(idx_list).unsqueeze(0))
            else:
                break
    return output
def get_data():
    df = preprocess.read_file("ratings.dat", sep="::")
    rows = len(df)

    # Integer location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)

    # Separate data: 90% train, 10% test -- think about a validation set later
    split_index = int(rows * 0.9)
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)

    return df_train, df_test
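# The shuffle-and-split idiom used in get_data, demonstrated on a toy DataFrame so
# the 90/10 behaviour is easy to verify in isolation (illustrative only; not part of
# the original module):
import numpy as np
import pandas as pd

toy = pd.DataFrame({"user": range(100), "rating": np.random.randint(1, 6, 100)})
toy = toy.iloc[np.random.permutation(len(toy))].reset_index(drop=True)
split = int(len(toy) * 0.9)
train, test = toy[:split], toy[split:].reset_index(drop=True)
assert len(train) == 90 and len(test) == 10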
def main():
    if len(sys.argv) != 3:
        print("Requires two arguments: a configuration file and your IP address.")
        exit(1)
    conf = read_file(sys.argv[1])
    ip = sys.argv[2]
    public_key = conf["publicKey"]
    password = conf["authorizedPasswords"][0]['password']
    bind = conf["interfaces"]['UDPInterface'][0]['bind']
    port = bind[bind.find(':') + 1:]
    json_dict = {"password": password, "port": port, "publicKey": public_key, "ip": ip}
    print(json.dumps(json_dict))
def process_labels():
    base_path = "/data1/xiuwen/twitter/tweet2020/tweet-without-conversation/"
    train_tag_path = base_path + "train_repeat_tag.txt"
    train_tag = read_file(train_tag_path)

    # remove spaces so each tag becomes a single token
    train_tag_processed = [''.join(i.split(' ')) for i in train_tag]
    tag_set = set(train_tag_processed)
    num = len(tag_set)
    tag_set_list = list(tag_set)
    index = range(len(tag_set_list))
    index_tag_mapping = dict(zip(index, tag_set_list))
    tag_index_mapping = dict(zip(tag_set_list, index))

    # one-hot encode every training tag
    encoding_of_tags = np.zeros((len(train_tag), num))
    for i in range(len(train_tag_processed)):
        encoding_of_tags[i, tag_index_mapping[train_tag_processed[i]]] = 1

    return encoding_of_tags, index_tag_mapping
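# The one-hot loop above can be cross-checked against a vectorised np.eye lookup;
# a small self-contained illustration with made-up tags (not part of the original code):
import numpy as np

tags = ["sports", "music", "sports", "news"]
uniq = sorted(set(tags))
tag_to_idx = {t: i for i, t in enumerate(uniq)}
one_hot = np.eye(len(uniq))[[tag_to_idx[t] for t in tags]]
assert one_hot.shape == (4, 3) and one_hot[0, tag_to_idx["sports"]] == 1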
def start(path: str,
          database: str = "mydatabase",
          user: str = "postgres",
          password: str = "12345",
          host: str = "127.0.0.1",
          port: str = "5432",
          n: int = 0) -> None:
    """
    Entry point of the service: takes the path to a file plus optional connection parameters.

    Creates a psql connection and the database, then reads a .csv or .xlsx file and
    extracts column names and types from it. If a table with that name already exists,
    the data is appended to it; otherwise the table is created and the data is inserted.

    :param path: the name of the file with the path to it
    :param database: name of the database
    :param user: name of the psql user
    :param password: password of the psql user
    :param host: host
    :param port: port
    :param n: index of the row containing the headers
    """
    register_adapter(np.int64, psycopg2._psycopg.AsIs)
    connection = create_connection("postgres", user, password, host, port)
    create_database_query = "CREATE DATABASE " + database
    create_database(connection, create_database_query)
    connection = create_connection(database, user, password, host, port)

    table, table_name = read_file(path, n)
    cursor = connection.cursor()
    cursor.execute(
        "select * from information_schema.tables where table_name=%s",
        (table_name, ))
    columns, data, types = preprocess(table)

    if bool(cursor.rowcount):
        insert(columns, data, table_name, connection)
        connection.commit()
    else:
        create_table(types, table_name, connection)
        insert(columns, data, table_name, connection)
        connection.commit()
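# A hypothetical invocation of start(); "students.csv" and "school" are placeholder
# names, and the default credentials must match a reachable local PostgreSQL server:
if __name__ == "__main__":
    start("students.csv", database="school", n=0)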
def _usage(argv):
    print('Usage: python %s <action>' % argv[0])
    print('\t where action is one of %r' % _ACTIONS)
    exit(1)


if __name__ == '__main__':
    argv = sys.argv
    if len(argv) != 2:
        _usage(argv)
    action = argv[1]
    if action not in _ACTIONS:
        _usage(argv)

    if action == 'preprocess':
        texts = []
        texts += read_pap(_PAP_FILENAME)
        texts += read_file(_POTOP_FILENAME)
        build_prepositions_map(texts, _PREPOSTITIONS_MAP)

    if action == 'prepositions':
        with open(_PREPOSTITIONS_MAP, 'rb') as f:
            prepositions_map = pickle.loads(f.read())
        stats, samples = prepositions_map_to_stats(prepositions_map)

        def samples_for_case(preposition, case):
            all_samples_for_case = samples[preposition][case]
            if len(all_samples_for_case) <= 5:
                return ', '.join(all_samples_for_case)
            else:
                return ', '.join(all_samples_for_case[:5] + ['...'])
import preprocess
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Read the data from file
file_train_data = preprocess.read_file("training.csv")
file_test_data = preprocess.read_file("test.csv")

# Drop the header line
file_train_data = file_train_data[1:]
file_test_data = file_test_data[1:]

# Required cleaning step
preprocess.clean_data_isalpha(file_train_data)
preprocess.clean_data_isalpha(file_test_data)

######### Balance the data set #########
# tag, info_dic_train = preprocess.clean_data_helper_get_info(file_train_data)
# file_train_data = preprocess.clean_data_balance(file_train_data, tag)
######### Balance the data set #########

# Store the data into separate arrays
article_number, text_data, article_topic = preprocess.process_data(file_train_data)
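# The imports above suggest the next step vectorises text_data and fits the gradient
# boosting model; a minimal sketch of that step, assuming text_data is a list of
# strings and article_topic the matching labels (the real pipeline may choose
# features and parameters differently):
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(text_data)
X_train, X_val, y_train, y_val = train_test_split(X, article_topic, test_size=0.2, random_state=42)
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
print(classification_report(y_val, clf.predict(X_val)))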
def test_preprocess_preprocess():
    """Test preprocess.py preprocess function"""
    assert preprocess(read_file('students.xlsx')[0])[2][1] == 'age integer'
    assert preprocess(read_file('students.xlsx')[0])[2][0] == 'name text'
    assert preprocess(read_file('students.xlsx')[0])[2][5] == 'GPA double precision'
def test_read_file(path, is_test, expect_output):
    output = read_file(path, is_test)
    assert expect_output == output[0][0]
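# The signature above suggests a parametrised test; a minimal sketch of how it could
# be driven with pytest.mark.parametrize (paths and expected values are placeholders,
# not cases from the original suite):
import pytest

@pytest.mark.parametrize("path, is_test, expect_output", [
    ("students.csv", False, "Alice"),
    ("students.csv", True, "Alice"),
])
def test_read_file_sketch(path, is_test, expect_output):
    output = read_file(path, is_test)
    assert expect_output == output[0][0]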
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

seq_length = 40
batch_size = 100
embedding_size = 200
attention_size = 200
topic_num = 100
dim_k = 100
drop_rate = 0.75

data_path = "/data1/xiuwen/twitter/tweet2020/tweet-without-conversation/"
train_data = "train_repeat_post.txt"
test_data = "test_post.txt"
path_to_glove_file = "/data1/xiuwen/glove.twitter.27B.200d.txt"

train_data = read_file(data_path + train_data)
test_data = read_file(data_path + test_data)
# train_label = read_file(data_path + "train_tag.txt")
# test_label = read_file(data_path + "test_tag.txt")

# vectorize words
vectorizer = TextVectorization(max_tokens=30000, output_sequence_length=seq_length)
text_ds = tf.data.Dataset.from_tensor_slices(train_data).batch(batch_size)
vectorizer.adapt(text_ds)

# get mapping from words to indices
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
num_words = len(voc) + 2
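# path_to_glove_file and num_words suggest a pre-trained embedding matrix is built
# next. A sketch of the usual pattern (assuming the standard space-separated GloVe
# text format; this step is an assumption and is not shown in the snippet above):
import numpy as np

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype="float32")

embedding_matrix = np.zeros((num_words, embedding_size))
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector   # words missing from GloVe stay all-zeros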