Example #1
def evaluate_multi(url, url2, time_lags=24):
    cr = Crawling()
    preds = utils.load_file(url)
    preds = np.array(preds)
    lt = len(preds)
    labels = utils.load_file(url2)
    labels = np.array(labels)

    loss_mae0, loss_mae1 = 0.0, 0.0
    loss_rmse0, loss_rmse1 = 0.0, 0.0
    r2_0, r2_1 = 0.0, 0.0
    for i, d in enumerate(preds):
        lb_i = i * pr.strides + time_lags + 1
        mae0, mse0, r2 = get_evaluation(d[:time_lags, :],
                                        labels[lb_i:(lb_i + time_lags), :, 0])
        # mae1, mse1 = get_evaluation(d[:time_lags,:,1], labels[lb_i:(lb_i+time_lags),:,1])
        loss_rmse0 += mse0
        # loss_rmse1 += mse1
        loss_mae0 += mae0
        # loss_mae1 += mae1
        r2_0 += r2
    loss_mae0 = loss_mae0 / lt * 300
    loss_mae1 = loss_mae1 / lt * 300
    loss_rmse0 = sqrt(loss_rmse0 / lt) * 300
    loss_rmse1 = sqrt(loss_rmse1 / lt) * 300
    r2_0 = r2_0 / lt
    print("MAE: %.6f %.6f" % (loss_mae0, cr.ConcPM25(loss_mae0)))
    print("RMSE: %.6f %.6f" % (loss_rmse0, cr.ConcPM25(loss_rmse0)))
    print("R2 Score: %.6f" % r2_0)
Example #2
def execute_gan(path, attention_url, url_weight, model, session, saver, batch_size, encoder_length, decoder_length, is_test, train_writer=None, offset=0):
    #if restore and not is_test:
    #    tf.reset_default_graph()
    #    print(tf.get_default_graph())
        #with tf.device('/%s' % p.device):
        #    model.init_ops(not is_test)
        #    #model.add_placeholders()
        #trainable_vars = tf.trainable_variables()
        #saver = tf.train.Saver(trainable_vars)
    print("==> Loading dataset")
    dataset = utils.load_file(path)
    if dataset:
        dataset = np.asarray(dataset, dtype=np.float32)
        lt = len(dataset)
        train, _ = utils.process_data_grid(lt, batch_size, encoder_length, decoder_length, True)
   
    attention_data = None
    if attention_url:
        attention_data = utils.load_file(attention_url)
   
    model.set_data(dataset, train, None, attention_data)
        #with tf.Session(config=gpu_configs) as session:
            #init = tf.global_variables_initializer()
            #session.run(init)
    model.assign_datasets(session)
    if not is_test:
        print("start training")
        for epoch in xrange(100):
            _ = model.run_epoch(session, train, offset + epoch, train_writer, train=True, verbose=False, stride=2)
        saver.save(session, 'weights/%s.weights' % url_weight)
    else:
        # saver.restore(session, url_weight)
        print('==> running model')
        _, preds = model.run_epoch(session, train, train=False, verbose=False, shuffle=False, stride=2)
        save_gan_preds(url_weight, preds)
Example #3
def get_grammatical_data(train_filename,
                         test_filename,
                         dict_filename,
                         translate_emojis=True,
                         replace_slang=True,
                         lowercase=True):
    # Load the train and test sets
    print("Loading data...")
    train_tokens = utils.load_file(path + "/res/tokens/tokens_" +
                                   train_filename)
    train_pos = utils.load_file(path + "/res/pos/pos_" + train_filename)
    test_tokens = utils.load_file(path + "/res/tokens/tokens_" + test_filename)
    test_pos = utils.load_file(path + "/res/pos/pos_" + test_filename)

    if translate_emojis and replace_slang and lowercase:
        save_path = path + "/res/data/finest_grammatical_"
    else:
        save_path = path + "/res/data/grammatical_"

    # Clean the data and bring it to the most *grammatical* form possible
    gramm_train = grammatical_clean(train_tokens,
                                    train_pos,
                                    path + "/res/" + dict_filename,
                                    save_path + train_filename,
                                    translate_emojis=translate_emojis,
                                    replace_slang=replace_slang,
                                    lowercase=lowercase)
    gramm_test = grammatical_clean(test_tokens,
                                   test_pos,
                                   path + "/res/" + dict_filename,
                                   save_path + test_filename,
                                   translate_emojis=translate_emojis,
                                   replace_slang=replace_slang,
                                   lowercase=lowercase)
    return gramm_train, gramm_test
Example #4
def get_traffic(**kwargs):
    global TRAFFIC_WRAPPER
    # t0 = time.time()
    if TRAFFIC_WRAPPER is None:
        wrapperFile = 'wrappers/traffic_wrapper.json'
        synonyms = load_synonyms('./datasets/sinonimos.csv')
        words = load_words()

        if os.path.isfile(wrapperFile):
            with open(wrapperFile,'r+') as rwjson:
                TRAFFIC_WRAPPER = ClassifierWrapper()
                TRAFFIC_WRAPPER.jsonLoads(rwjson.read())
                TRAFFIC_WRAPPER.dataset.dataset = list(load_file('./datasets/traffic2.csv'))
                TRAFFIC_WRAPPER.synonyms = copy.deepcopy(synonyms)
                TRAFFIC_WRAPPER.words = copy.deepcopy(words)
                TRAFFIC_WRAPPER.dataset.synonyms = copy.deepcopy(synonyms)
                TRAFFIC_WRAPPER.dataset.words = copy.deepcopy(words)
                return TRAFFIC_WRAPPER

        clf = kwargs.pop('clf', LogisticRegression(C=8.5))
        dataWrapperDataset = list(load_file('./datasets/traffic2.csv'))
        dataWrapper = DataWrapper(dataset=dataWrapperDataset,synonyms=copy.deepcopy(synonyms),words=copy.deepcopy(words))
        dataWrapper.resolveMatrix()

        wrapper = ClassifierWrapper(clf=clf,dataset=dataWrapper,synonyms=copy.deepcopy(synonyms),words=copy.deepcopy(words))
        cross_validate = kwargs.pop('cross_validate', True)
        if cross_validate:
            wrapper.cross_validate()
        wrapper.train()
        # print time.time() - t0, "seconds from the multiclass classifier"
        TRAFFIC_WRAPPER = wrapper
        with open(wrapperFile, 'w') as rw_json:
            json.dump(TRAFFIC_WRAPPER.toDict(), rw_json)
    return TRAFFIC_WRAPPER
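get_traffic only consumes two keyword arguments, clf and cross_validate, before caching the trained wrapper. A possible call under that reading (the classifier settings below are illustrative, not prescribed by the project):

from sklearn.linear_model import LogisticRegression

# Hypothetical usage: override the default classifier and skip cross-validation.
wrapper = get_traffic(clf=LogisticRegression(C=2.0), cross_validate=False)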
Example #5
def main(data_path):
    if path.exists(data_path + "/dict_char_en.pkl"):
        dict_char_en = utils.load_file(data_path + "/dict_char_en.pkl")
    else:
        dict_char_en = generateCharacterDict(properties.en_char)
        utils.save_file(data_path + "/dict_char_en.pkl", dict_char_en)
    if path.exists(data_path + "/dict_char_vi.pkl"):
        dict_char_vi = utils.load_file(data_path + "/dict_char_vi.pkl")
    else:
        dict_char_vi = generateCharacterDict(properties.vi_char)
        utils.save_file(data_path + "/dict_char_vi.pkl", dict_char_vi)
    if path.exists(data_path + "/dict_en.pkl"):
        dict_en = utils.load_file(data_path + "/dict_en.pkl", True)
    else:
        dict_en = build_dictionary(data_path, properties.vocab_en)
        utils.save_file(data_path + "/dict_en.pkl", dict_en, True)
    if path.exists(data_path + "/dict_vi.pkl"):
        dict_vi = utils.load_file(data_path + "/dict_vi.pkl", True)
    else:
        dict_vi = build_dictionary(data_path, properties.vocab_vi)
        utils.save_file(data_path + "/dict_vi.pkl", dict_vi, True)

    dataset_en, unknown_en = map_sentence_idx(
        data_path + "/" + properties.train_en, dict_en, dict_char_en)
    dataset_vi, unknown_vi = map_sentence_idx(
        data_path + "/" + properties.train_vi, dict_vi, dict_char_vi)
    utils.save_file(data_path + "/dataset_en.pkl", (dataset_en, unknown_en))
    utils.save_file(data_path + "/dataset_vi.pkl", (dataset_vi, unknown_vi))
Example #6
def read_data(args):

    data_sources = []
    header = ''
    if (args.f != None):
        if not isinstance(args.f, basestring):
            parts = []
            for afile in args.f:
                part_of_data = utils.load_file(afile)

                if args.e != None and args.e == 'y':
                    if header == '':
                        header = part_of_data[0]
                    part_of_data = part_of_data[1:len(part_of_data)]
                    
                parts.append(part_of_data.tolist())
                if args.s != None and args.s == 'y':
                    parts.append("\n")

            parts = [item for sublist in parts for item in sublist]
            data_sources = array(parts)
        else:
            data_sources = utils.load_file(args.f)


    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)

    return output
Example #7
def evaluate_transportation(url, url2, pred_length=8):
    preds = utils.load_file(url)
    preds = np.array(preds)
    lt = len(preds)
    labels = utils.load_file(url2)
    labels = np.array(labels)
    labels = labels.reshape(len(labels), 32, 32)
    shape = np.shape(preds)
    if preds.shape[-1] < pred_length:
        print("data shape is ", preds.shape)
        pred_length = preds.shape[-1]
    loss_mae0 = [0.0] * pred_length
    loss_rmse0 = [0.0] * pred_length
    r2_total = 0.0
    for i, d in enumerate(preds):
        # 8 is encoder_length
        lb_i = i + 8
        # labels[lb_i:(pred_length+lb_i),:,:]
        for x in xrange(pred_length):
            mae0, mse0, _ = get_evaluation(d[x, :, :], labels[lb_i + x, :, :])
            # mae0, mse0, r2 = get_evaluation(d[0,:,:], labels[lb_i,:,:])
            loss_rmse0[x] += mse0
            loss_mae0[x] += mae0
            # r2_total += r2
    loss_mae0 = [(x / lt * 131) for x in loss_mae0]
    loss_rmse0 = [(sqrt(x / lt) * 131) for x in loss_rmse0]
    # r2_total = r2_total / lt
    # print("MAE: %.6f" % loss_mae0)
    # print("RMSE: %.6f" % loss_rmse0)
    # print("R2 Score: %.6f" % r2_total)
    print_accumulate_error(loss_mae0, loss_rmse0, pred_length, 0)
Example #8
def get_data(vocabs=""):
    print("==> Load Word Embedding")
    word_embedding = utils.load_glove(use_index=True)

    validation_data = []
    training_data = []
    if not vocabs:
        non_words = utils.load_file(p.non_word, False)
        for w in non_words:
            w_ = w.replace('\n', '').split(' ')
            validation_data.append(int(w_[-1]))
        training_data = utils.sub(range(len(word_embedding)), validation_data)
    else:
        vocabs_set = utils.load_file(vocabs)
        print("vc", len(vocabs_set))
        training_data = [w for _, w in vocabs_set.iteritems()]
        tm = range(len(word_embedding))
        validation_data = list(utils.sub(set(tm), set(training_data)))
        length = int(math.ceil(len(training_data) * 1.0 / p.compression_batch_size)) * p.compression_batch_size - len(training_data)
        print('before', 'vd', len(validation_data), 'td', len(training_data))
        if length:
            add_on = np.random.choice(validation_data, length)
            training_data += add_on.tolist()
            validation_data = utils.sub(set(validation_data), set(add_on))
        print('vd', len(validation_data), 'td', len(training_data))
    # utils.save_file(p.glove_path, training_data)
    return word_embedding, training_data, validation_data
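The length computation pads training_data up to the next multiple of p.compression_batch_size by borrowing items from validation_data. A quick worked check, using 64 only as an illustrative batch size:

import math

batch_size = 64      # stand-in for p.compression_batch_size
n_train = 1000
pad = int(math.ceil(n_train * 1.0 / batch_size)) * batch_size - n_train
print(pad)           # 24, so 24 words would be re-sampled from validation_data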
Example #9
def execute_gan(path, attention_url, label_path, url_weight, model, session, saver, batch_size, encoder_length, decoder_length, is_test, train_writer=None, offset=0, gpu_nums=1):
    print("==> Loading dataset")
    dataset = utils.load_file(path)
    if dataset:
        dataset = np.asarray(dataset, dtype=np.float32)
        lt = len(dataset)
        train, _ = utils.process_data_grid(lt, batch_size, encoder_length, decoder_length, True)
    attention_data = None
    if attention_url:
        attention_data = utils.load_file(attention_url)

    labels = None
    if label_path:
        labels = utils.load_file(label_path)
    model.set_data(dataset, train, None, attention_data, labels)
    model.assign_datasets(session)

    if not is_test:
        print('==> starting training')
        suffix = p.weight_saving_break
        for epoch in xrange(p.total_iteration):
            _ = model.run_epoch(session, train, offset + epoch, train_writer, train=True, verbose=False, stride=4)
            tmp_e = epoch + 1
            if tmp_e % 10 == 0:
                suffix = math.ceil(float(tmp_e) / p.weight_saving_break)
                # utils.update_progress((epoch + 1) * 1.0 / p.total_iteration)
                saver.save(session, 'weights/%s_%i.weights' % (url_weight, suffix))
        saver.save(session, 'weights/%s_%i.weights' % (url_weight, suffix))
    else:
        # saver.restore(session, url_weight)
        print('==> running model')
        _, preds = model.run_epoch(session, train, train=False, verbose=False, shuffle=False, stride=2)
        save_gan_preds(url_weight, preds)
Example #10
def convert_vocab_to_text(vocabs):
    vocab_str = ""
    length = len(vocabs)
    i = 0
    vocab_idx = dict()
    vocab_lst = list()
    idx_file = '%s/%s' % (folder, 'vocabs_idx.pkl')
    if u.check_file(idx_file):
        vocab_idx = u.load_file(idx_file)
    else:
        for key, value in vocabs.iteritems():
            vocab_idx[value] = key
        u.save_file(idx_file, vocab_idx)
    lst_file = '%s/%s' % (folder, 'vocabs_list.pkl')
    if u.check_file(lst_file):
        vocab_lst = u.load_file(lst_file)
    else:
        for key in sorted(vocab_idx.iterkeys()):
            vocab_lst.append(vocab_idx[key])
        u.save_file(lst_file, vocab_lst)
    regex = RegexpTokenizer(r'\w+')
    for w in vocab_lst:
        words = regex.tokenize(w)
        if len(words) != 0:
            w_ = '_'.join(words)
            i += 1
            if i % 10000 == 0:
                print('Processed %i' % i)
                # break
            if i == length:
                vocab_str += '%s' % w_
            else:
                vocab_str += '%s\n' % w_
    return vocab_str
Example #11
def read_data(args):

    data_sources = []
    header = ''
    if (args.f != None):
        if not isinstance(args.f, basestring):
            parts = []
            for afile in args.f:
                part_of_data = utils.load_file(afile)

                if args.e != None and args.e == 'y':
                    if header == '':
                        header = part_of_data[0]
                    part_of_data = part_of_data[1:len(part_of_data)]

                parts.append(part_of_data.tolist())
                if args.s != None and args.s == 'y':
                    parts.append("\n")

            parts = [item for sublist in parts for item in sublist]
            data_sources = array(parts)
        else:
            data_sources = utils.load_file(args.f)

    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)

    return output
Example #12
def build_emoji_sentiment_dictionary():
    new_emoji_sentiment_filename = path + "/res/emoji/emoji_sentiment_dictionary.txt"
    if not os.path.exists(new_emoji_sentiment_filename):
        filename = path + "/res/emoji/emoji_sentiment_raw.txt"
        emojis = utils.load_file(filename)[1:]
        lines = []
        for line in emojis:
            line = line.split(",")
            emoji = line[0]
            occurences = line[2]
            negative = float(line[4]) / float(occurences)
            neutral = float(line[5]) / float(occurences)
            positive = float(line[6]) / float(occurences)
            description = line[7]
            lines.append(
                str(emoji) + "\t" + str(negative) + "\t" + str(neutral) +
                "\t" + str(positive) + "\t" + description.lower())
            utils.save_file(lines, new_emoji_sentiment_filename)
    emoji_sentiment_data = utils.load_file(new_emoji_sentiment_filename)
    emoji_sentiment_dict = {}
    for line in emoji_sentiment_data:
        line = line.split("\t")
        # Get emoji characteristics as a list [negative, neutral, positive, description]
        emoji_sentiment_dict[line[0]] = [line[1], line[2], line[3], line[4]]
    return emoji_sentiment_dict
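The returned dictionary maps each emoji to the list [negative, neutral, positive, description], with every field still a string as split from the file. A hypothetical lookup:

emoji_sentiment = build_emoji_sentiment_dictionary()
# Cast the ratios before using them numerically; the values are raw strings.
neg, neutral, pos, description = emoji_sentiment["😂"]
score = float(pos) - float(neg)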
Example #13
def exe(word_vectors_file, vector_preloaded_path, train_path, dev_path,
        test_path, hsi, hso, maxlen, pep, fep, ppat, fpat, plr, flr, mix):
    global word_vectors, vocabs
    if os.path.exists(train_path) and os.path.exists(
            dev_path) and os.path.exists(test_path):
        train = utils.load_file(train_path)
        dev = utils.load_file(dev_path)
        test = utils.load_file(test_path)
    else:
        raise NotImplementedError()
    if word_vectors is None or vocabs is None:
        word_vectors, vocabs = utils.loadWordVectors(word_vectors_file,
                                                     vector_preloaded_path)
    if not maxlen:
        maxlen = properties.maxlen
    lstm = Model(word_vectors,
                 hidden_sizes=[hsi, hso],
                 epochs=pep,
                 patience=ppat,
                 learning_rate=plr)
    lstm_params = lstm.train(train, dev, test, maxlen)
    if mix == 'Y':
        combined = LSTM_CNN(word_vectors,
                            hidden_sizes=[hsi, hso],
                            epochs=fep,
                            lstm_params=lstm_params)
        combined.train(train, dev, test, maxlen)
Example #14
def evaluate_single_pred(url, url2, decoder_length=8):
    cr = Crawling()
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    lt = data.shape[0] * data.shape[1]
    data = np.reshape(data, (lt, 25))
    dtl = len(data)
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    loss_mae = 0.0
    loss_rmse = 0.0
    r2_total = 0.0
    for i, d in enumerate(data):
        pred_t = np.asarray(d).flatten()
        lb_i = i * pr.strides + 24
        lbt = labels[lb_i:(lb_i + decoder_length), :, 0]
        lbg = lbt[decoder_length - 1, :].flatten()
        mae, mse, r2 = get_evaluation(pred_t, lbg)
        loss_mae += mae
        loss_rmse += mse
        r2_total += r2
        utils.update_progress((i + 1.0) / dtl)
    loss_mae = loss_mae / lt * 300
    loss_rmse = sqrt(loss_rmse / lt) * 300
    r2_total = r2_total / lt
    print("MAE: %.6f %.6f" % (loss_mae, cr.ConcPM25(loss_mae)))
    print("RMSE: %.6f %.6f" % (loss_rmse, cr.ConcPM25(loss_rmse)))
    print("R2 score: %.6f" % r2_total)
Example #15
def prepare_data(shuffle=False, labels_to_categorical=True):
    path = os.getcwd()[:os.getcwd().rfind("/")]
    to_write_filename = path + "/stats/data_prep_for_lstm_visualization.txt"
    utils.initialize_writer(to_write_filename)

    train_filename = "train.txt"
    test_filename = "test.txt"
    tokens_filename = "clean_original_"  # other types of tokens to experiment with in /res/tokens/
    data_path = path + "/res/tokens/tokens_"

    # Load the data
    train_data = utils.load_file(data_path + tokens_filename + train_filename)
    test_data = utils.load_file(data_path + tokens_filename + test_filename)

    if shuffle:
        train_data = utils.shuffle_words(train_data)
        test_data = utils.shuffle_words(test_data)
        print("DATA IS SHUFFLED")

    # Load the labels
    train_labels = [
        int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                        train_filename)
    ]
    test_labels = [
        int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" +
                                        test_filename)
    ]

    # Get the max length of the train tweets
    max_tweet_length = utils.get_max_len_info(train_data)

    # Convert all tweets into sequences of word indices
    tokenizer, train_indices, test_indices = utils.encode_text_as_word_indexes(
        train_data, test_data, lower=True)
    vocab_size = len(tokenizer.word_counts) + 1
    word_to_index = tokenizer.word_index
    print("There are %s unique tokens." % len(word_to_index))

    # Pad sequences with 0s (can do it post or pre - post works better here)
    x_train = pad_sequences(train_indices,
                            maxlen=max_tweet_length,
                            padding="post",
                            truncating="post",
                            value=0.)
    x_test = pad_sequences(test_indices,
                           maxlen=max_tweet_length,
                           padding="post",
                           truncating="post",
                           value=0.)

    # Transform the output into categorical data or just keep it as it is (in a numpy array)
    if labels_to_categorical:
        train_labels = to_categorical(np.asarray(train_labels))
        test_labels = to_categorical(np.asarray(test_labels))
    else:
        train_labels = np.array(train_labels)
        test_labels = np.array(test_labels)
    return x_train, train_labels, x_test, test_labels, vocab_size, tokenizer, max_tweet_length
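prepare_data returns a seven-element tuple; a typical way to consume it (the argument values are just the defaults shown above):

x_train, y_train, x_test, y_test, vocab_size, tokenizer, max_tweet_length = \
    prepare_data(shuffle=False, labels_to_categorical=True)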
Example #16
def evaluate_sp(url, url2, decoder_length=24, is_grid=True, grid_eval=True):
    cr = Crawling()
    map_ = heatmap.build_map()
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    if len(data.shape) == 4:
        lt = data.shape[0] * data.shape[1]
    else:
        lt = data.shape[0]
    if is_grid:
        data = np.reshape(data, (lt, data.shape[-2], 25, 25))
    else:
        data = np.reshape(data, (lt, data.shape[-2], 25))
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    loss_mae = 0.0
    loss_rmse = 0.0
    r2_total = 0.0
    for i, d in enumerate(data):
        d = d[:decoder_length, :, :]
        pred_t = []
        if is_grid:
            for d_ in d:
                d_t = heatmap.clear_interpolate_bound(np.asarray(d_), map_)
                pred_t.append(d_t)
        else:
            if grid_eval:
                for d_ in d:
                    d_t = heatmap.fill_map(d_, map_)
                    pred_t.append(d_t)
            else:
                pred_t = d
        lb_i = i * pr.strides + 24
        lbt = labels[lb_i:(lb_i + decoder_length), :, 0]
        if grid_eval:
            lbg = []
            for x in lbt:
                x_l = heatmap.fill_map(x, map_)
                lbg.append(x_l)
            lbg = np.asarray(lbg)
            lbg = lbg.flatten()
        else:
            lbg = lbt.flatten()
        pred_t = np.asarray(pred_t)
        pred_t = pred_t.flatten()
        mae, mse, r2 = get_evaluation(pred_t, lbg)
        loss_mae += mae
        loss_rmse += mse
        r2_total += r2
        utils.update_progress((i + 1.0) / lt)
    loss_mae = loss_mae / lt * 300
    loss_rmse = sqrt(loss_rmse / lt) * 300
    r2_total = r2_total / lt
    print("MAE: %.6f %.6f" % (loss_mae, cr.ConcPM25(loss_mae)))
    print("RMSE: %.6f %.6f" % (loss_rmse, cr.ConcPM25(loss_rmse)))
    print("R2 Score: %.6f" % r2_total)
Example #17
File: eval_mc.py Project: doc-doc/NExT-QA
def accuracy_metric(sample_list_file, result_file):

    sample_list = load_file(sample_list_file)
    group = {
        'CW': [],
        'CH': [],
        'TN': [],
        'TC': [],
        'DC': [],
        'DL': [],
        'DO': []
    }
    for id, row in sample_list.iterrows():
        qns_id = str(row['video']) + '_' + str(row['qid'])
        qtype = str(row['type'])
        #(combine temporal qns of previous and next as 'TN')
        if qtype == 'TP': qtype = 'TN'
        group[qtype].append(qns_id)

    preds = load_file(result_file)
    group_acc = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0}
    group_cnt = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0}
    overall_acc = {'C': 0, 'T': 0, 'D': 0}
    overall_cnt = {'C': 0, 'T': 0, 'D': 0}
    all_acc = 0
    all_cnt = 0
    for qtype, qns_ids in group.items():
        cnt = 0
        acc = 0
        for qid in qns_ids:

            cnt += 1
            answer = preds[qid]['answer']
            pred = preds[qid]['prediction']

            if answer == pred:
                acc += 1

        group_cnt[qtype] = cnt
        group_acc[qtype] += acc
        overall_acc[qtype[0]] += acc
        overall_cnt[qtype[0]] += cnt
        all_acc += acc
        all_cnt += cnt

    for qtype, value in overall_acc.items():
        group_acc[qtype] = value
        group_cnt[qtype] = overall_cnt[qtype]

    for qtype in group_acc:
        print(map_name[qtype], end='\t')
    print('')
    for qtype, acc in group_acc.items():
        print('{:.2f}'.format(acc * 100.0 / group_cnt[qtype]), end='\t')
    print('')
    print('Acc: {:.2f}'.format(all_acc * 100.0 / all_cnt))
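accuracy_metric only assumes that the prediction file maps each "<video>_<qid>" key to a dict holding 'answer' and 'prediction'. A stand-in result structure of that shape (the ids and values below are invented for illustration):

# Hypothetical contents of result_file after load_file:
preds_example = {
    "1001_0": {"answer": 1, "prediction": 1},   # counted as correct
    "1001_1": {"answer": 3, "prediction": 2},   # counted as wrong
}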
Example #18
def main():
    parser = get_parser()
    args = parser.parse_args()

    if args.doc:
        print __doc__
        sys.exit()

    g = geosearchclass.GeoSearchClass()

    if args.params:
        print 'Using parameters from ' + str(args.params)
        # turn parameter file into dictionary
        g.set_params_from_file(args.params)
        
    if args.address:
        print "Finding geocoordates for address:\n{}".format(args.address)
        coords = geo_converter.get_geocoords_from_address(args.address)
        if coords:
            g.latitude = coords[0]
            print "Found this latitude:"
            print g.latitude
            g.longitude = coords[1]
            print "Found this longitude:"
            print g.longitude
        else:
            print "Failed to find coordinates. Exiting."
            sys.exit()

    if args.input:
        text = utils.load_file(args.input)
        tokens = utils.tokenize_normal_words(text)
        for_poem = utils.filter_words(tokens)
    else:
        for_poem = get_default_words()

    if args.markov:
        if args.input:
            raise StandardError("Can only input a single text file. \
use --markov <your_text_file.txt>")
        else:
            text = utils.load_file(args.markov)
            # ngram = ngrams.make_ngram(text, 2)
            ngram = ngrams.make_bigram_trigram_dictionary(text)
            formatted_poem = create_poem(g, for_poem, ngram)
    else:
        formatted_poem = create_poem(g, for_poem)

    if args.output:
        print '\nwriting formatted poem to ' + str(args.output)
        output_file = args.output
    else:
        print "\nwriting formatted poem to poem.txt"
        output_file = "poem.txt"

    utils.save_file(output_file, formatted_poem)
Example #19
 def load_trained_params(self):
     lstm = utils.load_file('lstm_cb.txt')
     hidden_lstm = utils.load_file('hidden_cb.txt')
     hidden_relu_lstm = utils.load_file('hidden_relu_cb.txt')
     full_connect_lstm = utils.load_file('full_connect_cb.txt')
     convs = list()
     for x in range(len(self.filter_sizes)):
         conv = utils.load_file('convolution_%s.txt' % x)
         convs.append(conv)
     return lstm, hidden_lstm, hidden_relu_lstm, full_connect_lstm, convs
Example #20
def get_filtered_clean_data(train_filename, test_filename):
    # Loading the train and test sets
    print("Loading data...")
    train_tokens = utils.load_file(path + "/res/data/" + train_filename)
    test_tokens = utils.load_file(path + "/res/data/" + test_filename)
    filtered_train_tokens = ulterior_clean(
        train_tokens, path + "/res/data/filtered_" + train_filename)
    filtered_test_tokens = ulterior_clean(
        test_tokens, path + "/res/data/filtered_" + test_filename)
    return filtered_train_tokens, filtered_test_tokens
Example #21
def evaluate_lstm(url,
                  url2,
                  decoder_length=24,
                  forecast_factor=0,
                  is_classify=False):
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    lt = data.shape[0] * data.shape[1]
    data = np.reshape(data, (lt, data.shape[-1]))
    if decoder_length > data.shape[-1]:
        decoder_length = data.shape[-1]
    dtl = len(data)
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    if not is_classify:
        loss_mae = [0.0] * decoder_length
        loss_rmse = [0.0] * decoder_length
    else:
        acc = 0.
    #: r2_total = 0.0
    cr = Crawling()
    for i, d in enumerate(data):
        if decoder_length < data.shape[-1]:
            pred_t = d[:decoder_length]
        else:
            pred_t = d
        lb_i = i * pr.strides + 24
        lbt = np.mean(labels[lb_i:(lb_i + decoder_length), :, forecast_factor],
                      axis=1)
        a = 0.
        for t_i, (p, l) in enumerate(zip(pred_t, lbt)):
            if not is_classify:
                # mae, mse, _ = get_evaluation(p, l)
                mae = abs(cr.ConcPM10(p * 300) - cr.ConcPM10(l * 300))
                loss_mae[t_i] += mae
                # loss_rmse[t_i] += mse
            else:
                a += classify_data(pred_t, lbt, forecast_factor)
        if is_classify:
            a = a / decoder_length
            acc += a
        # r2_total += r2
        utils.update_progress((i + 1.0) / dtl)
    if not is_classify:
        loss_mae = np.array(loss_mae) / lt
        # loss_rmse = [sqrt(x / lt)  * 300 for x in loss_rmse]
        # print("R2 score: %.6f" % r2_total)
        print_accumulate_error(loss_mae,
                               loss_rmse,
                               decoder_length,
                               forecast_factor=forecast_factor)
    else:
        acc = acc / lt * 100
        print("accuracy %.4f" % acc)
Example #22
def get_strict_data(train_filename, test_filename):
    # Load the train and test sets
    print("Loading data...")
    train_tweets = utils.load_file(path + "/res/data/" + train_filename)
    test_tweets = utils.load_file(path + "/res/data/" + test_filename)

    # Initial clean of data
    strict_tweets_train = strict_clean(
        train_tweets, path + "/res/data/strict_" + train_filename)
    strict_tweets_test = strict_clean(
        test_tweets, path + "/res/data/strict_" + test_filename)
    return strict_tweets_train, strict_tweets_test
Example #23
    def __init_docs(self, file_names):
        docs = []  # type:List[Document]
        for f_name in file_names:
            # Read the text document
            text = load_file(self.root_path, f_name, "txt")  # type:str
            # Read entities and entity pairs
            ann_data = load_file(self.root_path, f_name, "ann")
            entities, entity_pairs = self.__get_entities_and_pairs(
                ann_data)  # type:NamedEntitySet,List[EntityPair]

            d = Document(f_name, self.root_path, text, entities, entity_pairs)
            docs.append(d)
            self._docs = docs
Example #24
 def transform(self, X, **transform_params):
     docs_topics_vectors = []
     lda_model = load_file("models/LDAbow_fbpac.pickle")
     lda_dictionary = load_file("models/LDAdict_fbpac.pickle")
     for doc in X:
         try:
             bow_vector = lda_dictionary.doc2bow(pre_process(doc))
             docs_topics_vectors.append(lda_model[bow_vector])
         except Exception as e:
             print(e)
             print("Error in computing topic vector")
     n, nx, ny = np.array(docs_topics_vectors).shape
     d2_all_docs = np.array(docs_topics_vectors).reshape((n, nx * ny))
     return d2_all_docs[:, 1::2]
Example #25
def main(urls, file=False):
    global loaded
    a_load = utils.load_file('cached.pkl')
    if a_load: 
        loaded = a_load
    else:
        loaded = dict()
    if file:
        urls = utils.load_file(urls)
        # bad = utils.load_file('sitemap_bad.txt')
    elif urls:
        urls = urls.split(',')
    scrape_list(urls)
    utils.save_file('cached.pkl', loaded)
Example #26
def get_clean_dl_data(train_filename, test_filename, word_list):
    vocab_filename = "dnn_vocabulary_" + train_filename
    # Load the train and test sets
    print("Loading data...")
    train_tweets = utils.load_file(path + "/res/tokens/tokens_" +
                                   train_filename)
    test_tweets = utils.load_file(path + "/res/tokens/tokens_" + test_filename)
    vocabulary = build_vocabulary_for_dnn_tasks(
        path + "/res/vocabulary/" + vocab_filename, train_tweets)
    clean_train_tweets, train_indices = vocabulary_filtering(
        vocabulary, train_tweets)
    clean_test_tweets, test_indices = vocabulary_filtering(
        vocabulary, test_tweets)
    return clean_train_tweets, train_indices, clean_test_tweets, test_indices, len(
        vocabulary)
Example #27
def convert_data_to_grid(url, out_url, url_att="", out_url_att="", part=1):
    grid = heatmap.build_map(pr.map_size)
    data = utils.load_file(url)
    lt = len(data)
    attention_data = None
    att_part = None
    print(url_att)
    if url_att:
        attention_data = utils.load_file(url_att)
        alt = len(attention_data)
        if lt != alt:
            raise ValueError(
                "Attention & Main Data need same length while %s and %s" %
                (lt, alt))
        data = zip(data, attention_data)
        att_part = []
    res = []
    if part != 1:
        bound = int(math.ceil(float(lt) / part))
    else:
        bound = lt
    for i, row in enumerate(data):
        if url_att:
            t, a = row
        else:
            t = row
        if i and (i % bound) == 0:
            p_i = i / bound
            out_url_name = out_url + "_" + str(p_i)
            utils.save_file(out_url_name, res)
            if url_att:
                att_out_url_name = out_url_att + "_" + str(p_i)
                utils.save_file(att_out_url_name, att_part)
            res = []
            att_part = []
        g = heatmap.fill_map(t, grid)
        res.append(g)
        if url_att:
            att_part.append(a)
        utils.update_progress(float(i) / lt)
    if part == 1:
        out_url_name = out_url
    else:
        out_url_name = out_url + "_" + str(part)
    utils.save_file(out_url_name, res)
    if url_att:
        att_out_url_name = out_url_att + "_" + str(part)
        utils.save_file(att_out_url_name, att_part)
Example #28
def load_multi_data(args):

    header = []
    data = {}
    if (args.f != None):
        if not isinstance(args.f, basestring):

            for afile in args.f:
                file_lines = utils.load_file(afile)
                
                count = 0
                for line in file_lines:
                    org_line = line
                    line = line.rsplit(',')
                    if (len(line) <= 1):
                        line = org_line.rsplit("\t")

                    if (count == 0):
                        if (len(header) > 0):
                            header.append(', '.join(line[1:len(line)]))
                        else:
                            header.append(', '.join(line))
 
                        count = count + 1
                        continue

                    if (line[0] not in data):
                        data[line[0]] = []

                    data[line[0]].append(', '.join(line[1:len(line)]))

    return (header, data)
Example #29
def pdb_to_lh5(traj, field):
    path = getattr(traj, field)
    data = load_file(path)
    new_fn = os.path.splitext(path)[0] + '.lh5'
    save_file(new_fn, data)
    os.unlink(path)
    setattr(traj, field, new_fn)
Example #30
def read_data(args):

    data_sources = []
    header = ""
    if args.f != None:

        parts = []
        afile = args.f

        part_of_data = utils.load_file(afile)

        if args.e != None and args.e == "y":
            if header == "":
                header = part_of_data[0]
            part_of_data = part_of_data[1 : len(part_of_data)]

        part_of_data = calculat_hsv_model(part_of_data, args.t)

        parts.append(part_of_data)

        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)

    output = []
    if header != "":
        output.append(header)
    for item in data_sources:
        output.append(item)

    return output
Example #31
def main(args):
    # Load file content.
    content = load_file(args.input, encoding=args.encoding)
    # Clean content.
    cleaned = clean(content, args.pattern)
    # Save cleaned content.
    save_file(args.output, cleaned, encoding=args.encoding)
Example #32
def split_hashtag_long_version(hashtag):
    word_file = path + "/res/word_list.txt"
    word_list = utils.load_file(word_file).split()
    word_dictionary = list(set(words.words()))
    for alphabet in "bcdefghjklmnopqrstuvwxyz":
        word_dictionary.remove(alphabet)
    all_poss = split_hashtag_to_words_all_possibilities(
        hashtag.lower(), word_dictionary)
    max_p = 0
    min_len = 1000
    found = False
    best_p = []
    for poss in all_poss:
        counter = 0
        for p in poss:
            if p in word_list:
                counter += 1
        if counter == len(poss) and min_len > counter:
            found = True
            min_len = counter
            best_p = poss
        else:
            if counter > max_p and not found:
                max_p = counter
                best_p = poss
    best_p_v2 = split_hashtag(hashtag, word_list)
    if best_p != [] and best_p_v2 != []:
        split_words = best_p if len(best_p) < len(best_p_v2) else best_p_v2
    else:
        if best_p == [] and best_p_v2 == []:
            split_words = [hashtag]
        else:
            split_words = best_p if best_p_v2 == [] else best_p_v2
    split_words = ['#' + str(s) for s in split_words]
    return split_words
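A hypothetical call, assuming the pieces of the hashtag appear in res/word_list.txt and the NLTK word corpus:

print(split_hashtag_long_version("goodmorning"))
# e.g. ['#good', '#morning'], depending on the contents of word_list.txt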
Example #33
def read_data(args):

    [operator, comparing_field_index, threshold] = args.t.rsplit(',')

    data_sources = []
    header = ''
    if (args.f != None):

        parts = []
        afile = args.f

        part_of_data = utils.load_file(afile)
        
        if args.e != None and args.e == 'y':
            if header == '':
                header = part_of_data[0]
            part_of_data = part_of_data[1:len(part_of_data)]
        
        
        part_of_data = filter_content(part_of_data, operator, comparing_field_index, threshold)
            
        parts.append(part_of_data)
        
        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)

    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)

    return output
Example #34
def read_data(args):

    data_sources = []
    header = ''
    if (args.f != None):

        parts = []
        afile = args.f

        part_of_data = utils.load_file(afile)
        
        if args.e != None and args.e == 'y':
            if header == '':
                header = part_of_data[0]
            part_of_data = part_of_data[1:len(part_of_data)]
        
        
        part_of_data = calculat_facedetection_model(part_of_data, args.t, args.c)
            
        parts.append(part_of_data)
        
        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)

    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)

    return output
Example #35
    def on_return(self, task):
        """Called by main thread on the return of data from the workers.
        Post-processing"""
        logger.info('Retrieved task %s', task.tag)
        traj = Session.query(models.Trajectory).get(int(task.tag))
        
        try:
            # save lh5 version of the trajectory
            conf = load_file(self.project.pdb_topology_file)
            coordinates = msmbuilder.Trajectory.load_trajectory_file(str(traj.dry_xtc_fn), Conf=conf)
            save_file(traj.lh5_fn, coordinates)
        
        except Exception as e:
            logger.error('When postprocessing %s, convert to lh5 failed!', traj)
            logger.exception(e)
            raise
        
        # convert last_wet_snapshot to lh5
        pdb_to_lh5(traj, 'last_wet_snapshot_fn')
        pdb_to_lh5(traj, 'init_pdb_fn')


        traj.host = task.host
        traj.returned_time = datetime.now()
        traj.length = len(coordinates)
        logger.info('Finished converting new traj to lh5 successfully')
Example #36
    def on_open1_activate(self, widget=None, file=''):
        self.on_quit2_activate()# this takes care of saving content yes/no
        if not file:
            dlg = FileDialog(action='open',title=_("Open GvR world"),ext='wld')
            response = dlg.run()
            if response == Gtk.ResponseType.OK:
                file = dlg.get_filename()
                if os.path.splitext(file)[1] != '.wld':
                    self.show_error(_("Selected path is not a world file"))
                    dlg.destroy()
                    return

            elif response == Gtk.ResponseType.CANCEL:
                self.logger.debug('Closed, no files selected')
                dlg.destroy()
                return

            dlg.destroy()

        txt = utils.load_file(file)
        if txt:
            self.set_text(file,txt)
            self.parent.on_button_reload()

        return
Example #37
    def on_open1_activate(self, widget=None,file=''):
        if not file:
            dlg = FileDialog(action='open',title=_("Open GvR program"),ext='gvr')
            response = dlg.run()
            if response == Gtk.ResponseType.OK:
                file = dlg.get_filename()
                if os.path.splitext(file)[1] != '.gvr':
                    self.show_error(_("Selected path is not a program file"))
                    dlg.destroy()
                    return

            elif response == Gtk.ResponseType.CANCEL:
                self.logger.debug('Closed, no files selected')
                dlg.destroy()
                return

            dlg.destroy()

        txt = utils.load_file(file)

        if txt:
            self.set_text(file,txt)

        for b in ('execute','step','abort'):
            self.parent._set_sensitive_button(b,True)

        return
Example #38
def read_file(afile, fields, filters, data_sources, data_labels, padding = None):

    part_of_data = utils.load_file(afile)
    part_of_data = part_of_data[1:len(part_of_data)]

    content = {}
    count = 0
    for line in part_of_data:
        line_fields = line.rsplit(',')
        if (len(line_fields) == 1):
            line_fields = line.rsplit("\t")

        selected_line = ''

        if (len(filters) == 0 or data_labels[count] in filters):
            selected_line = [line_fields[int(index)] for index in fields]
        else:
            if padding != None:
                selected_line = padding
            else:
                selected_line = ['0' for index in fields]

        #content.append(','.join(selected_line))
        content[line_fields[0]] = ','.join(selected_line)

        count += 1

    output_content = []
    for item in data_sources:
        if (item in content):
            output_content.append(content[item]) 
        else:
            print("key " + item + " is not in input file.")
        
    return "\n".join(output_content)
Example #39
def main():
    parser = argparse.ArgumentParser(description = 'Exporting data matrix from HIT summary result.')
    parser.add_argument('-f', action = 'append', help = 'The CSV files.')
    parser.add_argument('-c', help = 'The exporting columns separated with comma.')
    parser.add_argument('-o', help = 'The output file.')
    parser.add_argument('-t', help = 'The types used to filter out data row.')
    parser.add_argument('-p', default = '0', help = 'The padding for filtered rows.')
    parser.add_argument('-d', help = 'The data source file.')

    args = parser.parse_args()

    data_sources = []
    data_labels = []
    data_ids = []
    if (args.d != None):
        data_sources = utils.load_file(args.d)

        data_metainfo = regex_datasource(data_sources)

        # data_labels: flickr high interesting 1, flickr low interesting 2, pinterest [3, 4, 5]
        data_labels = data_metainfo[0]
        # data_ids: (flickr, pinterest) image id
        data_ids = data_metainfo[1]

    output = read_data(args, data_sources, data_labels)

    if (args.o != None):
        utils.write_file(output, args.o)
Example #40
def load_rules(path, *, encoding='utf-8'):
    """Load parse file with 'pseudo' tree of rules.

    Args:
        path: string with a path to the file with rules,
        encoding: encoding of the file (default='utf-8')

    Return:
        dict: dictionary with pseudo tree structure, representing
              hierarchy of rules

    NOTE:
        Expected file structure:

        +- rule1_from --> rule1_to
        |  +- sub_rule1_from --> sub_rule1_to
        |  |  +- sub_sub_rule1_from --> sub_sub_rule1_to
        +- rule2_from --> rule2_to
        +- rule3_from --> rule3_to
        |  +- sub_rule3_from --> sub_rule3_to

    """
    # Load rules from a file.
    raw_rules = load_file(path, encoding=encoding)
    # Parse rules into 'pseudo' tree structure.
    return parse_rules(list(raw_rules))
Example #41
def read_file(filename, n_fold):
 
    data_sources = []
    parts = []
    
    part_of_data = utils.load_file(filename)
    part_of_data = part_of_data[1:len(part_of_data)]
    part_of_data = filter_content(part_of_data)
        
    parts.append(part_of_data)
    
    parts = [item for sublist in parts for item in sublist]
    data_sources = array(parts)

    random.shuffle(data_sources)

    data_count_limit = len(data_sources) / n_fold
    folds = []
    count = 0
    for begin_index in range(0, len(data_sources), data_count_limit):
        end_index = begin_index + data_count_limit

        if (count == n_fold - 1):
            end_index = len(data_sources)

        print("begin: " + str(begin_index))
        print("end: " + str(end_index))

        folds.append(data_sources[begin_index:end_index])

        count += 1

    return folds
Example #42
def read_data(args):

    data_sources = []
    header = ''
    if (args.f != None):

        parts = []
        afile = args.f

        part_of_data = utils.load_file(afile)

        if args.e != None and args.e == 'y':
            if header == '':
                header = part_of_data[0]
            part_of_data = part_of_data[1:len(part_of_data)]

        part_of_data = calculat_hsv_model(part_of_data, args.t)

        parts.append(part_of_data)

        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)

    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)

    return output
Example #43
def main():
    parser = argparse.ArgumentParser(description = 'Generate HITs for Amazon Mechanical Turk workers.')
    parser.add_argument('-f', help = 'The mtk data source file.')
    parser.add_argument('-o', help = 'The output file of used data.')

    args = parser.parse_args()

    data_sources = []
    if (args.f != None):
        data_sources = utils.load_file(args.f)
        random.shuffle(data_sources)

    db_collections = hit.setup_mongodb()
    data_metainfo = hit.regex_datasource(data_sources)
    images_metainfo = hit.query_imagedata_from_db(db_collections, data_metainfo)

    # data_labels: flickr high interesting 1, flickr low interesting 2, pinterest [3, 4, 5]
    data_labels = data_metainfo[0]
    # data_ids: (flickr, pinterest) image id
    data_ids = data_metainfo[1]

    data_count_limit = 50

    for begin_index in range(0, len(data_sources), data_count_limit):
        print("index: " + str(begin_index))
        generate_hits(data_sources[begin_index:begin_index + data_count_limit], begin_index, args, data_ids[begin_index:begin_index + data_count_limit], images_metainfo)

    sys.exit(0)
Example #44
def main():
    parser = argparse.ArgumentParser(description = 'Generate HITs for Amazon Mechanical Turk workers.')
    parser.add_argument('-f', help = 'The mtk data source file.')
    parser.add_argument('-o', help = 'The output file of used data.')
    parser.add_argument('-m', default = 'normal', help = 'The running mode in {normal, qua_init, qua}.')
    parser.add_argument('-q', help = 'The qualification type id.')
    parser.add_argument('-t', default = 'sandbox', help = 'The type of Mechanical Turk.')


    args = parser.parse_args()

    if (args.m == 'qua' and args.q == None):
        print('Please give qualification type id if running in qualification mode.')
        sys.exit(0)

    data_sources = []
    if (args.f != None):
        data_sources = utils.load_file(args.f)
        if (args.m != 'qua'):
            random.shuffle(data_sources)

    data_count_limit = 100

    for begin_index in range(0, len(data_sources), data_count_limit):
        print("index: " + str(begin_index))
        generate_hits(args.t, data_sources[begin_index:begin_index + data_count_limit], begin_index, args)

    sys.exit(0)
Example #45
def welcome(update, context):
    chat_id = update.effective_chat.id

    for new_member in update.message.new_chat_members:
        chat_title = update.message.chat.title

        if not new_member.is_bot:
            first_name = new_member.first_name
            last_name = new_member.last_name
            username = new_member.username

            if last_name is None:
                last_name = ""

            message = WELCOME_NEW_MEMBER_MESSAGE.format(
                first_name=first_name,
                last_name=last_name,
                username=username,
                chat_title=chat_title,
            )

            context.bot.sendPhoto(
                chat_id=chat_id,
                photo=load_file(
                    context=context,
                    pickle_file=DEFAULT_PICKEL_FILE_PHOTO,
                    default_photo=DEFAULT_PHOTO_WELCOME,
                    chat_id=chat_id,
                ),
                caption=message,
            )
        elif new_member.full_name == "PyLadies Brasil Bot":
            context.bot.send_message(
                chat_id=chat_id,
                text=HELLO_MESSAGE.format(chat_title=chat_title))
Example #46
def add_hits(filename, all_hits):

    hits = utils.load_file(filename)
    hits = hits[0:len(hits)]
    all_hits.append(hits)

    return all_hits
Example #47
def read_data(args):

    x_column = int(args.x)
    y_column = int(args.y)
    bin_length = int(args.b)
    threshold = float(args.t)
    x_threshold = int(args.d)

    data_sources = []
    if (args.f != None):

        parts = []
        afile = args.f

        part_of_data = utils.load_file(afile)
        part_of_data = calculat_hsv_figure(part_of_data, x_column, y_column, bin_length, threshold, x_threshold)
            
        parts.append(part_of_data)
        
        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)

    output = []
    for item in data_sources:
        output.append(item)

    return output
Example #48
def main(args):
    content_generator = load_file(args.transcript, encoding=args.encoding)
    rules = load_rules(args.rules, encoding=args.encoding)

    mapped = list(do_mapping(content_generator, rules))
    formatted = format_data(mapped)

    save_file(args.output, formatted, encoding=args.encoding)
Example #49
def load_csv(path, encoding='utf-8'):
    for item in load_file(path, encoding=encoding):
        splitted = item.split(';')
        # Change to SINGLE word file format
        splitted[2] = splitted[2][:-3]
        splitted[3] = splitted[3][:-3]

        yield splitted
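Each yielded item is the semicolon-split row with the last three characters cut from fields 2 and 3. A made-up row to show only the slicing (the real column semantics are not visible here):

# Hypothetical row, illustrating the [:-3] trimming of fields 2 and 3.
fields = "id;word;0.120_a;0.340_b;rest".split(';')
fields[2] = fields[2][:-3]   # '0.120_a' -> '0.12'
fields[3] = fields[3][:-3]   # '0.340_b' -> '0.34'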
Example #50
def main(args):
    content_generator = load_file(args.transcript, encoding=args.encoding)
    rules = load_rules(args.rules, encoding=args.encoding)

    mapped = do_mapping(content_generator, rules)
    cleaned = clean(mapped)
    formatted = mlf_format_data(cleaned)

    save_file(args.output, formatted, encoding=args.encoding)
Example #51
 def load(cls):
     """
     Load rules from config file.
     """
     if cls._loaded_rules is None:
         log("Loading contextual rules...", "CYAN", True)
         lx = load_file("corpus/contextual_rules.rls")
         cls._loaded_rules = [r for r in lx.split(u"\n") if len(r) > 1]
     return cls._loaded_rules
Example #52
 def load(cls):
     if cls._loaded_rules is None:
         log("Loading lemmatizer rules...", "CYAN", True)
         lx = load_file("corpus/lemmatizer_rules.rls")
         cls._loaded_rules = []
         for line in lx.split(u"\n"):
             els = line.split(u"\t")
             if els[0] != u"":
                 cls._loaded_rules.append(els[0])
     return cls._loaded_rules
Example #53
def main(args):
    # Load alignment of the phonemes
    alignment = load_mlf_to_dict(args.mlf, clean=False)
    # Merge items in the mlf if the output is from AP decoder
    if (args.a):
        alignment = process_ap(alignment)
    # Load path to the processed WAV files, should corresponds to the alignment
    paths = load_file(args.scp)
    # Load WAV files
    waves = get_wave(paths)
    # Use Noiser library to the changing of the tempo
    process(waves, alignment, args.output, int(args.tempo), args.skip)
Example #54
 def setUp(self):
     super(TestPublish, self).setUp()
     self.user = User('*****@*****.**')
     self.login('*****@*****.**')
     self.issue = models.Issue(subject='test')
     self.issue.local_base = False
     self.issue.put()
     self.ps = models.PatchSet(parent=self.issue, issue=self.issue)
     self.ps.data = load_file('ps1.diff')
     self.ps.save()
     self.patches = engine.ParsePatchSet(self.ps)
     db.put(self.patches)
Example #55
 def setUp(self):
   super(TestStatusListener, self).setUp()
   self.user = users.User('*****@*****.**')
   self.login('*****@*****.**')
   self.issue = models.Issue(subject='test')
   self.issue.local_base = False
   self.issue.put()
   self.ps = models.PatchSet(parent=self.issue.key, issue_key=self.issue.key)
   self.ps.data = load_file('ps1.diff')
   self.ps.put()
   self.patches = engine.ParsePatchSet(self.ps)
   ndb.put_multi(self.patches)
   self.logout() # Need to log out for /status_listener to work
Example #56
def read_data(args):

    data_sources = []
    if (args.f != None):
        if not isinstance(args.f, basestring):
            parts = []
            for afile in args.f:
                part_of_data = utils.load_file(afile)
                parts.append(part_of_data.tolist())

            parts = [item for sublist in parts for item in sublist]
            data_sources = array(parts)
        else:
            data_sources = utils.load_file(args.f)


    hits = []
    head_of_hits = ''
    if (args.m != None):
        if not isinstance(args.m, basestring):
            parts = []
            for afile in args.m:
                part_of_data = utils.load_file(afile)
                if head_of_hits == '':
                    head_of_hits = part_of_data[0]
                part_of_data = part_of_data[1:len(part_of_data)]
                parts.append(part_of_data.tolist())
 
            parts = [item for sublist in parts for item in sublist]
            hits = array(parts)
 
        else:
            hits = utils.load_file(args.m)
            hits = hits[0]
            hits = hits[1:len(hits)]

    return (data_sources, hits, head_of_hits)
Example #57
def main():
    parser = argparse.ArgumentParser(description = 'Generate HITs for Amazon Mechanical Turk workers.')
    parser.add_argument('-f', help = 'The mtk data source file.')
    parser.add_argument('-o', help = 'The output file of used data.')

    args = parser.parse_args()

    data_sources = []
    if (args.f != None):
        data_sources = utils.load_file(args.f)
        random.shuffle(data_sources)

    generate_hits(data_sources, args)

    sys.exit(0)
Example #58
File: lexicon.py Project: liberation/sulci
    def loaded(self):
        """
        Load lexicon in RAM, from file.

        The representation will be a dict {"word1": [{tag1 : lemme1}]}
        """
        if not self.PATH in self._loaded:  # Caching and lazy loading
            sulci_logger.debug("Loading lexicon...", "RED", True)
            lx = load_file("%s/lexicon.lxc" % self.PATH)
            self._loaded[self.PATH] = {}
            for line in lx.split("\n"):
                if line:
                    lexicon_entity = LexiconEntity(line)
                    self.add_factors(lexicon_entity.word)
                    self._loaded[self.PATH][lexicon_entity.word] = lexicon_entity
        return self._loaded[self.PATH]
Example #59
def read_file(filename):

    data_sources = []
    parts = []

    part_of_data = utils.load_file(filename)
    part_of_data = part_of_data[1 : len(part_of_data)]
    part_of_data = filter_content(part_of_data)

    parts.append(part_of_data)

    parts = [item for sublist in parts for item in sublist]

    dictionary = {}
    for item in parts:
        dictionary[item] = 1

    return dictionary