def test_preprocess_read():
    """Test preprocess.py read_file function"""
    assert read_file('students.csv')[0]['GPA'][0] == 4.7
    assert read_file('students.xlsx')[0]['GPA'][0] == 4.3
    assert read_file('students.csv')[1] == 'students'
    assert read_file('new_data/students.csv')[1] == 'students'
    assert read_file('students.xlsx')[1] == 'students'
def predict(input_path, output_path, resources_path):
    """
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the BIES format.
    
    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.
    
    N.B. DO NOT HARD CODE PATHS IN HERE. Use resources_path instead, otherwise we will not be able to run the code.

    :param input_path: the path of the input file to predict.
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """

    #load the training file with the sentences of all four sub-datasets to build the vocab
    concat = pre.read_file(
        os.path.join(resources_path, "tensor_concat_train.utf8"))
    #create the vocab from the concatenation of all training files
    vocab_unigrams = pre.make_vocab(concat, 1)
    vocab_bigrams = pre.make_vocab(concat, 2)

    #load the test file from input
    test_tensor_lines = pre.read_file(input_path)

    #convert the input arrays into vocabulary indices
    test_x_uni = pre.word_to_index(test_tensor_lines, vocab_unigrams, 1)
    test_x_bi = pre.word_to_index(test_tensor_lines, vocab_bigrams, 2)

    #DEFINE SOME CONSTANTS
    VOCAB_SIZE_UNI = len(vocab_unigrams)
    VOCAB_SIZE_BI = len(vocab_bigrams)
    CHAR_EMBEDDING_SIZE = 32
    BIGRAM_EMBEDDING_SIZE = [16, 32, 64]
    LEARNING_RATE = [0.04, 0.035, 0.03, 0.02, 0.009]
    HIDDEN_SIZE = 256
    INPUT_DROPOUT = [0, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6]
    LSTM_DROPOUT = [0, 0.1, 0.2, 0.3, 0.4]

    #BUILD THE MODEL
    model = md.create_keras_model_parallel(VOCAB_SIZE_UNI, VOCAB_SIZE_BI,
                                           CHAR_EMBEDDING_SIZE,
                                           BIGRAM_EMBEDDING_SIZE[1],
                                           HIDDEN_SIZE, INPUT_DROPOUT[2],
                                           LSTM_DROPOUT[2])
    print("Load the weights...")
    #load the best weights
    model.load_weights(os.path.join(resources_path, "weights.hdf5"))
    print("Predict...")
    #compute the result from the prediction
    result = result_from_prediction(model, test_x_uni, test_x_bi)
    print("Save the result on ", output_path)
    #create the output file with result from prediction
    pre.create_file(output_path, result)
    print("Done!")
Example #3
def main():
    if len(sys.argv) != 3:
        print "Requires two arguments: a configuration file and a file containing json with the necessary information to add a peer."
        exit(1)
    conf = read_file(sys.argv[1])
    info = read_file(sys.argv[2])

    conf["authorizedPasswords"].append({"password": info["password"]})
    conf["interfaces"]['UDPInterface'][0]['connectTo'][info["ip"] + ":" + info["port"]] = {"password":info["password"], "publicKey":info["publicKey"]}
    with open(sys.argv[1],"w") as out:
        out.write(json.dumps(conf, sort_keys=True, indent=4))
Example #4
def test_loader(index):
    with open('name2idx.json', 'r') as fp:
        name2idx = json.load(fp)
    root = ClassNode('root')

    body = read_file('dataset/test/%d.txt' % index).body
    if not body:
        return None
        # raise ValueError('Invalid html file.')
    find_child(body, root)
    path_list = get_all_paths(root)
    output = []

    for j in range(len(path_list)):
        # print('Processing %d-%d...' % (index, j))
        is_looping = True
        for k in itertools.product(*path_list[j][1:]):
            idx_list = []
            for i in k:
                try:
                    idx_list.append(name2idx[i])
                except KeyError:
                    print('Class names out of scope: ', i)
                    is_looping = False
                    break
            if is_looping:
                output.append(torch.tensor(idx_list).unsqueeze(0))
            else:
                break
    return output
Example #5
def get_data():
    df = preprocess.read_file("ratings.dat", sep="::")
    rows = len(df)
    #Integer location based indexing for selection by position
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)

    #Separate data: 90% Train, 10% Test -- think about validation set later
    split_index = int(rows * 0.9)
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)

    return df_train, df_test
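A minimal usage sketch for get_data, assuming ratings.dat (with "::"-separated fields) is present in the working directory:

#roughly a 90/10 split of the shuffled ratings
df_train, df_test = get_data()
print(len(df_train), len(df_test))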
Example #6
def main():
    if len(sys.argv) != 3:
        print "Requires two arguments: a configuration file and your IP address."
        exit(1)
    conf = read_file(sys.argv[1])
    ip = sys.argv[2]
    public_key = conf["publicKey"]
    password = conf["authorizedPasswords"][0]['password']
    bind = conf["interfaces"]['UDPInterface'][0]['bind']
    port = bind[bind.find(':') + 1:]
    json_dict = {"password":password, "port":port, "publicKey":public_key, "ip":ip}
    print(json.dumps(json_dict))
Example #7
def process_labels():
    base_path = "/data1/xiuwen/twitter/tweet2020/tweet-without-conversation/"
    train_tag_path = base_path + "train_repeat_tag.txt"
    train_tag = read_file(train_tag_path)
    train_tag_processed = [''.join(i.split(' ')) for i in train_tag]
    tag_set = set(train_tag_processed)
    num = len(tag_set)
    tag_set_list = list(tag_set)
    index = range(len(tag_set_list))
    index_tag_mapping = dict(zip(index, tag_set_list))
    tag_index_mapping = dict(zip(tag_set_list, index))
    encoding_of_tags = np.zeros((len(train_tag), num))
    for i in range(len(train_tag_processed)):
        encoding_of_tags[i, tag_index_mapping[train_tag_processed[i]]] = 1
    return encoding_of_tags, index_tag_mapping
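A hedged usage sketch: encoding_of_tags is a one-hot matrix over the unique tags and index_tag_mapping recovers the tag string from a column index (the variable names below are only illustrative):

encoding_of_tags, index_tag_mapping = process_labels()
first_tag_index = int(np.argmax(encoding_of_tags[0]))  # column of the single 1 in row 0
print(index_tag_mapping[first_tag_index])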
def start(path: str,
          database: str = "mydatabase",
          user: str = "postgres",
          password: str = "12345",
          host: str = "127.0.0.1",
          port: str = "5432",
          n: int = 0) -> None:
    """
    Gets the name of the file with path to it and optional parameters
    The body of service
    Creates psql connection and database
    Then reads .csv or .xlsx file, gets column names and types from it
    Then adds data if the table with such name already exists
    Creates the table and adds the data inside if the table with such name
    doesn't exist
    :param path: the name of the file with path to it
    :param database: name of the database
    :param user: name of psql user
    :param password: password of psql user
    :param host: host
    :param port: port
    :param n: number of row with headers
    """
    register_adapter(np.int64, psycopg2._psycopg.AsIs)

    connection = create_connection("postgres", user, password, host, port)
    create_database_query = "CREATE DATABASE " + database
    create_database(connection, create_database_query)
    connection = create_connection(database, user, password, host, port)

    table, table_name = read_file(path, n)

    cursor = connection.cursor()
    cursor.execute(
        "select * from information_schema.tables where table_name=%s",
        (table_name, ))
    columns, data, types = preprocess(table)

    if not cursor.rowcount:
        create_table(types, table_name, connection)
    insert(columns, data, table_name, connection)
    connection.commit()
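A minimal call sketch, assuming a local PostgreSQL server reachable with the default credentials from the signature:

# hypothetical call; adjust the credentials for your own server
start("students.csv", database="mydatabase", user="postgres",
      password="12345", host="127.0.0.1", port="5432", n=0)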
Example #9
def _usage(argv):
    print('Usage: python %s <action>' % argv[0])
    print('\t where action is one of %r' % _ACTIONS)
    exit(1)


if __name__ == '__main__':
    argv = sys.argv
    if len(argv) != 2:
        _usage(argv)
    action = argv[1]
    if action not in _ACTIONS:
        _usage(argv)
    if action == 'preprocess':
        texts = []
        texts += read_pap(_PAP_FILENAME)
        texts += read_file(_POTOP_FILENAME)
        build_prepositions_map(texts, _PREPOSTITIONS_MAP)
    if action == 'prepositions':
        with open(_PREPOSTITIONS_MAP, 'rb') as f:
            prepositions_map = pickle.loads(f.read())
        stats, samples = prepositions_map_to_stats(prepositions_map)


        def samples_for_case(preposition, case):
            all_samples_for_case = samples[preposition][case]
            if len(all_samples_for_case) <= 5:
                return ', '.join(all_samples_for_case)
            else:
                return ', '.join(all_samples_for_case[:5] + ['...'])

Example #10
import preprocess
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Reading the data from file
file_train_data = preprocess.read_file("training.csv")
file_test_data = preprocess.read_file("test.csv")

# get rid of the first line of each file
file_train_data = file_train_data[1:]
file_test_data = file_test_data[1:]

# required cleaning step
preprocess.clean_data_isalpha(file_train_data)
preprocess.clean_data_isalpha(file_test_data)

######### Balance the data set #########
#tag, info_dic_train = preprocess.clean_data_helper_get_info(file_train_data)
#file_train_data = preprocess.clean_data_balance(file_train_data, tag)
######### Balance the data set #########

# store the data into separate arrays
article_number, text_data, article_topic = preprocess.process_data(
    file_train_data)
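The imports above point to a TF-IDF plus GradientBoostingClassifier pipeline; the block below is only a hedged sketch of how text_data and article_topic could feed into it, assuming text_data is a list of document strings (split sizes and hyperparameters are placeholders):

# assumed continuation, not part of the original script
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(text_data)
X_train, X_val, y_train, y_val = train_test_split(
    X, article_topic, test_size=0.1, random_state=0)

clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
print(classification_report(y_val, clf.predict(X_val)))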
def test_preprocess_preprocess():
    """Test preprocess.py preprocess function"""
    assert preprocess(read_file('students.xlsx')[0])[2][1] == 'age integer'
    assert preprocess(read_file('students.xlsx')[0])[2][0] == 'name text'
    assert preprocess(
        read_file('students.xlsx')[0])[2][5] == 'GPA double precision'
Example #12
def test_read_file(path, is_test, expect_output):
    output = read_file(path, is_test)
    assert expect_output == output[0][0]
Example #13
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    seq_length = 40
    batch_size = 100
    embedding_size = 200
    attention_size = 200
    topic_num = 100
    dim_k = 100
    drop_rate = 0.75

    data_path = "/data1/xiuwen/twitter/tweet2020/tweet-without-conversation/"
    train_data = "train_repeat_post.txt"
    test_data = "test_post.txt"
    path_to_glove_file = "/data1/xiuwen/glove.twitter.27B.200d.txt"

    train_data = read_file(data_path + train_data)
    test_data = read_file(data_path + test_data)
    # train_label = read_file(data_path + "train_tag.txt")
    # test_label = read_file(data_path + "test_tag.txt")

    # vectorize words
    vectorizer = TextVectorization(max_tokens=30000,
                                   output_sequence_length=seq_length)
    text_ds = tf.data.Dataset.from_tensor_slices(train_data).batch(batch_size)
    vectorizer.adapt(text_ds)

    # get mapping from words to indices
    voc = vectorizer.get_vocabulary()
    word_index = dict(zip(voc, range(len(voc))))
    num_words = len(voc) + 2
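    # Hedged sketch of the likely next step (not in the original fragment),
    # assuming numpy is imported as np: load the GloVe vectors from
    # path_to_glove_file into an embedding matrix of shape
    # (num_words, embedding_size); rows for padding/OOV words stay at zero.
    embeddings_index = {}
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.asarray(coefs.split(), dtype="float32")

    embedding_matrix = np.zeros((num_words, embedding_size))
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector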