def prepare_data(input_file, colname, label_colname=None):
    if label_colname is None:
        X = read_data(input_file, colname)
    else:
        X, Y = read_data(input_file, colname, label_colname)
    # c = Cleaner()
    print("starting to clean data..")
    # Text cleaning is currently disabled; pass the text through unchanged.
    # X = [c.full_clean(text) for text in X]
    X = list(X)
    if label_colname is None:
        return X
    return X, Y
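
# Example usage (with the column names used in the train() snippet further down):
# texts, labels = prepare_data('clean_train.csv', 'question_text', 'target')
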
def test():
    filename = argv[1] if len(argv) > 1 else 'r03.1_R_H_20.dow'
    data = read_data(filename)
    start = time.time()
    lagrange_relaxation(data)
    stop = time.time()
    print('Lagrange relaxation done. Time: {} seconds'.format(stop - start))
Example #3
File: int.py Project: timbk/pmt_sim
def button_load_ref(event):  # handler for BUTTON_LREF clicked
    global CURR_REF_DATA, CURR_REF_PLOT
    is_tia = messagebox.askquestion('TIA?',
                                    'Do you want to load a TIA measurement?')
    fname = filedialog.askopenfilename()
    print(fname)
    if isinstance(fname, str) and fname:  # askopenfilename returns '' if the dialog is cancelled
        CURR_REF_DATA, extra_info, _, soffset = read_data(fname)
        print(is_tia)
        if is_tia == 'yes':
            CURR_REF_DATA = np.array(CURR_REF_DATA) / 2.
        CURR_REF_DATA = np.histogram(CURR_REF_DATA,
                                     bins=NBIN,
                                     range=(-5e6, 3.5e7))

        if CURR_REF_PLOT is not None:
            CURR_REF_PLOT.remove()
            del CURR_REF_PLOT
        print(CURR_REF_DATA[0])
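        # bin width; x-values below are shifted by half a bin so points sit at bin centers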
        rD = CURR_REF_DATA[1][1] - CURR_REF_DATA[1][0]
        CURR_REF_PLOT = ax.errorbar(CURR_REF_DATA[1][:-1] + rD / 2,
                                    CURR_REF_DATA[0],
                                    np.sqrt(CURR_REF_DATA[0]),
                                    ls='none',
                                    c='k',
                                    label='Reference',
                                    zorder=1000000)
        plt.legend()
def test():
    time_start = time.time()
    global DEBUG
    DEBUG = False
    data = read_data('c38_R_H_5.dow')
    heuristic_main(data, return_primal=True)
    time_finish = time.time()
    print('Total time: {} s'.format(time_finish - time_start))
def test():
    time_start = time.time()
    global DEBUG
    DEBUG = False
    filename = 'r01.1_R_H_10.dow' if len(argv) <= 1 else argv[1]
    data = read_data(filename)
    objective, open_arcs = heuristic_main(data)
    objective, model = make_local_branching_model(
        data, 2, open_arcs, objective)
    print('objective: {}'.format(objective))
    time_finish = time.time()
    print('Total time: {} s'.format(time_finish - time_start))
def main():
    filename = 'r03.1_R_H_20.dow' if len(argv) <= 1 \
        else argv[1]
    data = read_data(filename)
    start = time()
    data.graph = make_graph(data)
    subproblems = populate_dual_subproblem(data)
    master = populate_master(data)
    master = add_in_out_cuts(master, subproblems, data)
    master_callback = callback_data(subproblems, data)
    master.optimize(master_callback)
    stop = time()
    print('Total time: {} seconds'.format(round(stop - start, 0)))
def test():
    root_path = '../../../MPMCFP_DataGen/'
    data_path_c = 'c_Instances_Dec_Fixed_Cost/'
    c_trial, r_trial = 'c33_R_H_10.dow', 'r01.1_R_H_10.dow'
    filename = root_path + data_path_c
    filename += c_trial
    data = read_data(filename)
    models = []
    for t in range(data.periods - 1):
        model = Model(data, t)
        delta = np.ones(shape=data.commodities)
        model.solve(data)
        print('period: {} model status: {}'.format(t, model.status))
        models.append(model)
Example #8
def main():
    filename = 'r03.1_R_H_20.dow' if len(argv) <= 1 \
        else argv[1]
    data = read_data(filename)
    start = time()
    data.graph = make_graph(data)
    # objective, open_arcs = heuristic(data, 2, 'u')[:2]
    # print 'Heuristic objective value: {}'.format(objective)
    subproblems = populate_dual_subproblem(data)
    master = populate_master(data)
    master_callback = callback_data(subproblems, data)
    master.optimize(master_callback)
    stop = time()
    print('Total time: {} seconds'.format(round(stop - start, 0)))
Example #9
def test():
    time_start = time.time()
    global DEBUG
    DEBUG = False
    root_path = './'
    r_trial = 'r03.1_R_H_20.dow'
    filename = root_path
    filename += r_trial if len(argv) <= 1 else argv[1]
    filename = argv[1] if platform.system() == 'Linux' else filename
    data = read_data(filename)
    # Uncomment for uncapacitated problems
    # data.capacity = np.array([10e+9] * len(data.capacity), dtype=float)
    objective, open_arcs = heuristic_main(data)
    print('objective: {}'.format(objective))
    time_finish = time.time()
    print('Total time: {} s'.format(time_finish - time_start))
def test():
    time_start = time.time()
    global DEBUG
    DEBUG = False
    root_path = '../../../DataDeterministicFC/'
    data_path_c = 'c_Instances_Dec_Fixed_Cost/'
    data_path_r = 'r_Instances_Dec_Fixed_Cost/'
    c_trial, r_trial = 'c33_R_H_10.dow', 'r01.1_R_H_10.dow'
    filename = root_path + data_path_c
    filename += c_trial if len(argv) <= 1 else argv[1]
    filename = argv[1] if platform.system() == 'Linux' else filename
    data = read_data(filename)
    data.capacity = np.array([10e+9] * len(data.capacity), dtype=float)
    objective, open_arcs, flow_cost = heuristic_main(data)
    objective, open_arcs = make_local_branching_model(data, 10, open_arcs)
    print('objective: {}'.format(objective))
    time_finish = time.time()
    print('Total time: {} s'.format(time_finish - time_start))
Example #11
    def create_TD(self,
                  src_directory=SRC_DIRECTORY,
                  src_extension=SRC_EXTENSION,
                  txt_directory=TXT_DIRECTORY,
                  indices_file=INDICES_FILE,
                  concat_file=CONCAT_FILE,
                  batch_size=BATCH_SIZE,
                  from_XML=True):
        if from_XML:
            src_to_txts(src_directory, src_extension, txt_directory)
            # src directory contains the source XML files

        concat_txts(txt_directory=txt_directory, target_name=concat_file)

        vocab = read_data(open(concat_file, "r"))

        #dict takes in words and spits out indices, r_dict the opposite
        c_indices, count, dict, r_dict = collect_data(vocab, VOC_SIZE)

        #create files where each line is the index of the word
        self.input_idx, self.output_idx = write_indices_files(
            concat_file, dict, indices_file)

        self.input_scrubbed = scrub(self.input_idx)
        self.output_scrubbed = scrub(self.output_idx)

        #use the above files to create our training data files
        self.input_sorted, self.output_sorted = sort(self.input_scrubbed,
                                                     self.output_scrubbed,
                                                     BATCH_SIZE)

        self.input_padded = pad(self.input_sorted, batch_size)
        self.output_padded = pad(self.output_sorted, batch_size)

        self.input_reversed = reverse(self.input_padded)

        self.input_reversed, self.output_padded = remove_bad_batches(
            self.input_reversed, self.output_padded, batch_size)

        self.batchlist, self.input_td, self.input_valid, self.tdcount, self.validcount = make_input_val(
            VAL_PCT, self.input_reversed, BATCH_SIZE)

        self.output_td, self.output_valid = make_output_val(
            self.output_padded, BATCH_SIZE, self.batchlist)
Example #12
def main():
    config = helpers.read_config()
    elogger = logger.get_logger()

    # initialize arrays for short-term and long-term traffic features
    speed_array = 'speeds'
    time_array = 'times'
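    # 256 x 256 grid of cells; each cell maps a key (e.g. a time bin) to lists of observed speeds and travel times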
    short_ttf = [[
        collections.defaultdict(lambda: {
            speed_array: [],
            time_array: []
        }) for _ in range(256)
    ] for _ in range(256)]
    long_ttf = [[
        collections.defaultdict(lambda: {
            speed_array: [],
            time_array: []
        }) for _ in range(256)
    ] for _ in range(256)]

    for data_file in config['data']:
        elogger.info(
            'Generating G and T paths and extracting traffic features on {} ...'
            .format(data_file))

        data = helpers.read_data(data_file)

        define_travel_grid_path(data, config['coords'], short_ttf, long_ttf,
                                args.grid_size)

        elogger.info(
            'Saving extended with G and T paths data in {}{}.\n'.format(
                args.data_destination_folder, data_file))
        helpers.save_processed_data(data, args.data_destination_folder,
                                    data_file)

    elogger.info('Aggregate historical traffic features ...')
    helpers.aggregate_historical_data(short_ttf, long_ttf)
    elogger.info('Saving extracted traffic features in {}'.format(
        args.ttf_destination_folder))
    helpers.save_extracted_traffic_features(short_ttf, long_ttf,
                                            args.ttf_destination_folder)
Example #13
def test():

    data = read_data('small7.dow')
    graph = make_graph(data)
    make_adjacency_matrix(graph, data)
Example #14
from helpers import read_data, get_settings, package_translation
import api

settings = get_settings()
article_map = read_data('article_map')
locales = ['de', 'es', 'fr', 'ja', 'pt-br']

for article in article_map:
    url = '{}/articles/{}/translations/missing.json'.format(
        settings['src_root'], article)
    missing_locales = api.get_resource_list(url,
                                            list_name='locales',
                                            paginate=False)
    for locale in locales:
        if locale in missing_locales:  # if translation missing in src, nothing to move
            continue
        print('Moving {} translation for article {}'.format(locale, article))

        # get translation in src hc
        url = '{}/articles/{}/translations/{}.json'.format(
            settings['src_root'], article, locale)
        translation = api.get_resource(url)

        # create translation in dest hc
        url = '{}/articles/{}/translations.json'.format(
            settings['dst_root'], article_map[article])
        payload = package_translation(translation)
        api.post_resource(url, payload)

print('\nFinished moving translations.\n')
Example #15
def main(data, num_generations, num_trees, fold, seed, model_file, blackbox_model):
    ###########
    kf = StratifiedKFold(shuffle=True, n_splits=10, random_state=seed)
    X, y = read_data("data/"+data+".csv")

    # Split the data based on the fold of this run
    train_index, test_index = list(kf.split(X, y))[fold]
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # H2o requires specially formatted data
    h2_train = h2o.H2OFrame(python_obj=np.hstack((X_train, y_train)))
    h2_test = h2o.H2OFrame(python_obj=np.hstack((X_test, y_test)))

    # =================
    # Train the complex model
    # =================

    blackbox_options = {
        "RF": H2ORandomForestEstimator(ntrees=100),
        "GB": H2OGradientBoostingEstimator(ntrees=100),
        "DL": H2ODeepLearningEstimator(epochs=1000),
    }

    # Choose the model based on the given parameter
    blackbox = blackbox_options[blackbox_model]

    blackbox.train(x=h2_train.columns[:-1], y=h2_train.columns[-1], training_frame=h2_train)

    # We use the predictions from the model as the new "labels" for training surrogate.
    blackbox_train_predictions = blackbox.predict(h2_train)["predict"].as_data_frame().values
    blackbox_train_score = scorer(blackbox_train_predictions, y_train)

    blackbox_test_predictions = blackbox.predict(h2_test)["predict"].as_data_frame().values
    blackbox_test_score = scorer(blackbox_test_predictions, y_test)

    print("The " + blackbox.__class__.__name__ + " achieved", "%.2f" % blackbox_train_score, "on the train set and",
          "%.2f" % blackbox_test_score, "on the test set")

    # =================
    # Train the surrogates
    # =================

    dt_training_recreating_pct, dt_testing_recreating_pct, dt_complexity = \
        decision_tree(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)

    print("DT was able to recreate %.2f%%" % dt_training_recreating_pct, "of them on the train, and %.2f%%" %
          dt_testing_recreating_pct, "on the test set")

    sdt_training_recreating_pct, sdt_testing_recreating_pct, sdt_complexity = \
        simplified_decision_tree(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)

    print("SDT was able to recreate %.2f%%" % sdt_training_recreating_pct, "of them on the train, and %.2f%%" %
          sdt_testing_recreating_pct, "on the test set")

    ds_training_recreating_pct, ds_testing_recreating_pct, ds_complexity = \
        decision_stump(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)

    print("DS was able to recreate %.2f%%" % ds_training_recreating_pct, "of them on the train, and %.2f%%" %
          ds_testing_recreating_pct, "on the test set")

    lr_training_recreating_pct, lr_testing_recreating_pct, lr_complexity = \
        logistic_regression(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)

    print("LR was able to recreate %.2f%%" % lr_training_recreating_pct, "of them on the train, and %.2f%%" %
          lr_testing_recreating_pct, "on the test set")

    '''
    brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity = \
        bayesian_rule_list(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions)

    print("BRL was able to recreate %.2f%%" % brl_training_recreating_pct, "of them on the train, and %.2f%%" %
          brl_testing_recreating_pct, "on the test set")
          '''
    # The BRL surrogate above is disabled; define placeholders so the return statement below still works.
    brl_training_recreating_pct = brl_testing_recreating_pct = brl_complexity = None

    gp_training_recreating_pct, gp_testing_recreating_pct, gp_complexity = \
        genetic_program(X_train, blackbox_train_predictions, X_test, blackbox_test_predictions, num_generations,
                        num_trees, model_file)

    print("GP was able to recreate %.2f%%" % gp_training_recreating_pct, "of them on the train, and %.2f%%" %
          gp_testing_recreating_pct, "on the test set")

    return [blackbox_train_score, blackbox_test_score,
            dt_training_recreating_pct, dt_testing_recreating_pct, dt_complexity,
            sdt_training_recreating_pct, sdt_testing_recreating_pct, sdt_complexity,
            ds_training_recreating_pct, ds_testing_recreating_pct, ds_complexity,
            lr_training_recreating_pct, lr_testing_recreating_pct, lr_complexity,
            brl_training_recreating_pct, brl_testing_recreating_pct, brl_complexity,
            gp_training_recreating_pct, gp_testing_recreating_pct, gp_complexity]
Example #16
import arrow

from helpers import read_data, write_data, get_settings
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['article_votes'])
article_map = read_data('article_map')

for src_article in article_map:
    dst_article = article_map[src_article]
    print('\nGetting votes for article {}...'.format(src_article))
    url = '{}/{}/articles/{}/votes.json'.format(settings['src_root'],
                                                settings['locale'],
                                                src_article)
    votes = api.get_resource_list(url)
    if not votes:
        print('- no votes found')
        continue
    for vote in votes:
        if last_sync < arrow.get(vote['created_at']):
            print('- adding vote {} to article {}'.format(
                vote['id'], dst_article))
            if vote['value'] == -1:
                url = '{}/articles/{}/down.json'.format(
                    settings['dst_root'], dst_article)
            else:
                url = '{}/articles/{}/up.json'.format(settings['dst_root'],
                                                      dst_article)
            payload = {
Example #17
import arrow

from helpers import read_data, write_data, get_settings, package_article, verify_author, write_js_redirects
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['articles'])
section_map = read_data('section_map')
article_map = read_data('article_map')
exceptions = read_data('exceptions')

for section in section_map:
    # # test-only section (ref docs) -> comment out for sync
    # if section != "206223768":
    #     continue
    dst_section = section_map[section]
    print('\nGetting articles in section {}...'.format(section))
    url = '{}/{}/sections/{}/articles.json'.format(settings['src_root'],
                                                   settings['locale'], section)
    articles = api.get_resource_list(url)
    for src_article in articles:
        if str(src_article['id']) in exceptions:
            print('{} is an exception. Skipping...'.format(src_article['id']))
            continue
        if last_sync < arrow.get(src_article['created_at']):
            print('- adding article {} to destination section {}'.format(
                src_article['id'], dst_section))
            src_article['author_id'] = verify_author(src_article['author_id'],
                                                     settings['team_user'])
            url = '{}/{}/sections/{}/articles.json'.format(
Example #18
import arrow

from helpers import read_data, write_data, get_settings
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['comment_votes'])
article_map = read_data('article_map')
comment_map = read_data('comment_map')
comment_article_map = read_data('comment_article_map')

for src_comment in comment_map:
    src_article = comment_article_map[src_comment]
    dst_article = article_map[src_article]
    dst_comment = comment_map[src_comment]
    print('Getting votes for comment {}...'.format(src_comment))
    url = '{}/{}/articles/{}/comments/{}/votes.json'.format(
        settings['src_root'], settings['locale'], src_article, src_comment)
    votes = api.get_resource_list(url)
    if not votes:
        print('- no votes found')
        continue
    for vote in votes:
        if last_sync < arrow.get(vote['created_at']):
            print('- adding vote {} to comment {}'.format(
                vote['id'], dst_comment))
            if vote['value'] == -1:
                url = '{}/articles/{}/comments/{}/down.json'.format(
                    settings['dst_root'], dst_article, dst_comment)
            else:
def test():
    data = read_data('c33_R_H_5.dow')
    heur_solution = heuristic_main(data=data,
                                   return_primal=True,
                                   track_time=True)[2]
    master_model = make_master(data=data, heur_solution=heur_solution)
Example #20
# In[]:
from helpers import read_data

# In[]:
Aquifer_Auser = read_data('Aquifer_Auser.csv')
Aquifer_Doganella = read_data('Aquifer_Doganella.csv')
Aquifer_Luco = read_data('Aquifer_Luco.csv')
Aquifer_Petrignano = read_data('Aquifer_Petrignano.csv')
Lake_Bilancino = read_data('Lake_Bilancino.csv')
River_Arno = read_data('River_Arno.csv')
Water_Spring_Amiata = read_data('Water_Spring_Amiata.csv')
Water_Spring_Lupa = read_data('Water_Spring_Lupa.csv')
Water_Spring_Madonna_di_Canneto = read_data(
    'Water_Spring_Madonna_di_Canneto.csv')

# In[]:
River_Arno.head().T

# In[]:
print('Gabriel was here')
Example #21
from helpers import read_data, get_settings
import api


settings = get_settings()
section_map = read_data('section_map')

for section in section_map:
    dst_section = section_map[section]
    print('\nGetting subscriptions for section {}...'.format(section))
    url = '{}/{}/sections/{}/subscriptions.json'.format(settings['src_root'], settings['locale'], section)
    subscriptions = api.get_resource_list(url)
    if not subscriptions:
        print('- no subscriptions found')
        continue
    for sub in subscriptions:
        print('- adding subscription {} to section {}'.format(sub['id'], dst_section))
        url = '{}/sections/{}/subscriptions.json'.format(settings['dst_root'], dst_section)
        if sub['include_comments'] is True:
            payload = {'subscription': {'source_locale': settings['locale'], 'user_id': sub['user_id'],
                                        'include_comments': True}}
        else:
            payload = {'subscription': {'source_locale': settings['locale'], 'user_id': sub['user_id']}}
        response = api.post_resource(url, payload)
        if response is False:
            print('Skipping subscription {}'.format(sub['id']))
Example #22
helper_dir = r'C:\Users\oyina\src\senior_2019-2020\lab\bijsterbosch\project\oyin'

sys.path.append(helper_dir)
import helpers as help
import importlib


#%% updating help
help = importlib.reload(help)

#%% read in data from the two sites
siteB_file = "FNETs_siteB.txt"
siteH_file = "FNETs_siteH.txt" 
num_regions = 10

site_B_data = help.read_data(siteB_file)
site_H_data = help.read_data(siteH_file)

#%% create x data

# initialize arrays to hold the site data; the singleton second axis is the channel dimension
site_B_connectomes = np.ones((len(site_B_data), 1, num_regions, num_regions))
site_H_connectomes = np.ones((len(site_H_data), 1, num_regions, num_regions))

# create data matrices
for person in range(len(site_B_data)):
    site_B_connectomes[person, :, :, :] = help.list_to_connectome(site_B_data[person], num_regions)
for person in range(len(site_H_data)):
    site_H_connectomes[person, :, :, :] = help.list_to_connectome(site_H_data[person], num_regions)

#%% create y data
Example #23
# parse parameters
IS_TIA = '--tia' in sys.argv
sys.argv = [i for i in sys.argv if i[0] != '-']
if len(sys.argv) != 3:
    print(USAGE)
    exit(1)
print('sys.argv', sys.argv)
fname = sys.argv[1]
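# note: eval() trusts whatever is typed on the command line; ast.literal_eval would be a safer way to parse a literal tuple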
iv = eval(sys.argv[2])
assert isinstance(iv, tuple)
print('parameter format: %s' % repr(IV_FORMAT))
print('using %s as iv' % repr(iv))

# read measurement data
cmp_data, extra_info, _, soffset = read_data(fname)
cmp_data = np.array(cmp_data)
if IS_TIA:
    cmp_data /= 2.

# do hist of meas. data
hist, bins = np.histogram(cmp_data, bins=NBIN, range=BIN_RANGE)
cmp_Y = hist
D = bins[1] - bins[0]
cmp_X = bins[:-1] + D / 2.

fname = ('.'.join(fname.split('.')[:-1])).split('/')[-1]

# do the noise fit
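# keep only the bins whose counts are above 10% of the histogram peak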
sel = cmp_Y > (max(cmp_Y) / 10.)
noise_fit_X = cmp_X[sel]
Example #24
def train(
        input_file="clean_train.csv",
        text_col="question_text",
        label_col="target",
        valid_ratio=0.2,
        max_sentence_length=91,
        sample_percent=1,
        class_weights=None,
        cell_type="gru",
        embedding="word2vec",
        embedding_path="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
        embedding_dim=300,
        rnn_layers=3,
        hidden_size=128,
        one_minus_dropout=0.5,
        l2_reg=3.0,
        batch_size=32,
        epochs=5,
        learning_rate=1e-3,
        allow_soft_placement=True,
        log_device_placement=False,
        display_every=10,
        evaluate_every=100,
        checkpoint_every=100,
        num_checkpoints=5):
    # Load and split data
    print("Loading data..")
    X, Y = read_data(input_file,
                     text_col,
                     label_col,
                     sample_percent=sample_percent)

    # Create a vocabulary processor.
    # Its job is to assign each unique word an integer and to replace each word in our sentences with its corresponding integer.
    # These mappings are later used again to substitute each word with its embedding.
    # This method also trims sentences or adds trailing zeros to pad each sentence to a fixed length.
    print("Setting up vocabulary..")
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        max_sentence_length)
    X = np.array(list(vocab_processor.fit_transform(X)))
    print("Vocabulary Size: ", len(vocab_processor.vocabulary_))
    num_classes = len(Y[0])

    # split in to train and validation
    X, Y, x_val, y_val = split_data(X, Y, valid_ratio)

    # initialize tensorflow config
    print("Initializing tensorflow session..")
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            print("Initializing our RNN:")
            print("\nseq_length : ", X.shape[1], "\nnum_classes : ",
                  Y.shape[1], "\nvocab_size : ",
                  len(vocab_processor.vocabulary_), "\nembedding_size : ",
                  embedding_dim, "\ncell_type : ", cell_type,
                  "\nhidden_size : ", hidden_size, "\nl2 : ", l2_reg,
                  "\nclass_weights :  ", class_weights, "\nbatch_size : ",
                  batch_size, "\nrnn_layers :  ", rnn_layers)
            # Initialize our RNN
            rnn = RNN(seq_length=X.shape[1],
                      num_classes=Y.shape[1],
                      vocab_size=len(vocab_processor.vocabulary_),
                      embedding_size=embedding_dim,
                      cell_type=cell_type,
                      hidden_size=hidden_size,
                      l2=l2_reg,
                      class_weights=class_weights,
                      batch_size=batch_size,
                      rnn_layers=rnn_layers)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(learning_rate).minimize(
                rnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            val_summary_op = tf.summary.merge([loss_summary, acc_summary])
            val_summary_dir = os.path.join(out_dir, "summaries", "val")
            val_summary_writer = tf.summary.FileWriter(val_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")

            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Initializing pretrained embeddings if embedding flag is up
            if embedding:
                # initial matrix with random uniform
                initW = np.random.uniform(
                    -0.25, 0.25,
                    (len(vocab_processor.vocabulary_), embedding_dim))

                # In the case of GloVe, loading embeddings is straightforward:
                # read each line; the first token is the word,
                # and everything else on the line is the embedding vector for that word
                if "glove" in embedding:
                    with open(embedding_path, "r", encoding="utf8") as f:
                        for line in f:
                            first_word = line.partition(' ')[0]
                            rest = line[line.index(' ') + 1:]
                            # Find if word in our vocabulary
                            idx = vocab_processor.vocabulary_.get(first_word)
                            if idx != 0:
                                # If yes, substitute the GloVe embedding for the random one
                                initW[idx] = np.fromstring(rest,
                                                           dtype='float32',
                                                           sep=" ")
                # In case of word2vec, we are given a bin file
                elif "word2vec" in embedding:
                    with open(embedding_path, "rb") as f:
                        # First line is header containing information about number of records and size of one record
                        header = f.readline()
                        vocab_size, layer1_size = map(int, header.split())
                        # Then, number of bytes in each record  = (size of a float) * size of one record
                        binary_len = np.dtype('float32').itemsize * layer1_size
                        # for each record
                        for line in range(vocab_size):
                            word = []
                            while True:
                                # Keep reading one character at a time
                                ch = f.read(1).decode('latin-1')
                                if ch == ' ':
                                    # until you find a space, then the first word is complete
                                    word = ''.join(word)
                                    break
                                if ch != '\n':
                                    word.append(ch)
                            # Try to find that first word in our vocabulary
                            idx = vocab_processor.vocabulary_.get(word)
                            if idx != 0:
                                # if found, substitute the corresponding embedding vector for the random one
                                initW[idx] = np.fromstring(f.read(binary_len),
                                                           dtype='float32')
                            else:
                                f.read(binary_len)

                sess.run(rnn.W_text.assign(initW))
                print("Successful to load ", embedding, "!\n")

            # Once we are done with the embeddings and basic tensorflow settings
            # We now start with actual training routine

            # Generate batches
            itr = batch_iterator(X, Y, batch_size, epochs)
            # For each batch
            for x_batch, y_batch, start, end in itr:
                # Train
                feed_dict = {
                    rnn.input_text: x_batch,
                    rnn.input_label: y_batch,
                    rnn.keep_prob: one_minus_dropout
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, rnn.loss,
                    rnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % evaluate_every == 0:
                    print("\nEvaluation:")
                    total_preds = np.zeros(y_val.shape)
                    itr2 = batch_iterator(x_val,
                                          y_val,
                                          batch_size,
                                          1,
                                          shuffle=False)
                    avg_acc = 0
                    avg_loss = 0
                    steps = 0
                    for x_eval_batch, y_eval_batch, s, e in itr2:
                        feed_dict_val = {
                            rnn.input_text: x_eval_batch,
                            rnn.input_label: y_eval_batch,
                            rnn.keep_prob: 1.0
                        }
                        summaries_val, loss, accuracy, preds = sess.run([
                            val_summary_op, rnn.loss, rnn.accuracy,
                            rnn.predictions
                        ], feed_dict_val)
                        val_summary_writer.add_summary(summaries_val, step)
                        k = np.array([
                            one_hot_encode(num_classes, label)
                            for label in preds
                        ])
                        avg_acc += accuracy
                        avg_loss += loss
                        steps += 1
                        total_preds[s:e] = k
                    cf, f_score = confusion_matrix(y_val, total_preds, 2)
                    avg_acc /= steps
                    avg_loss /= steps
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: loss {:g}, acc {:g}, fscore {:g}\n".format(
                        time_str, avg_loss, avg_acc, f_score))
                    print("Confusion Matrix")
                    print(cf)
                # Model checkpoint
                if step % checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
Example #25
import arrow

from helpers import read_data, write_data, get_settings, package_article, verify_author, write_js_redirects
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['articles'])
section_map = read_data('section_map')
article_map = read_data('article_map')
user_segment_map = read_data('user_segment_map')
permission_group_map = read_data('permission_group_map')
exceptions = read_data('exceptions')

for section in section_map:
    # # test-only section (ref docs) -> comment out for sync
    # if section != "206223768":
    #     continue
    dst_section = section_map[section]
    print('\nGetting articles in section {}...'.format(section))
    url = '{}/{}/sections/{}/articles.json'.format(settings['src_root'],
                                                   settings['locale'], section)
    articles = api.get_resource_list(url)
    for src_article in articles:
        if settings['cross_instance']:
            if src_article['user_segment_id'] is not None:
                dst_user_segment = user_segment_map[str(
                    src_article['user_segment_id'])]
            else:
                dst_user_segment = None
            dst_permission_group = permission_group_map[str(
                src_article['permission_group_id'])]
            src_article['user_segment_id'] = dst_user_segment
Example #26
    def inference(self, weights_file=None, load=False):

        vocab = read_data(open(CONCAT_FILE, "r"))

        # dict takes in words and spits out indices, r_dict the opposite
        c_indices, count, dict, r_dict = collect_data(vocab, VOC_SIZE)

        inference_encInput = Input(shape=(None, ),
                                   dtype="int32",
                                   name="input_seqinf")

        temp = self.emb_A(inference_encInput)
        temp = self.enc1(temp)
        #temp = self.enc2(temp)
        temp = self.enc3(temp)
        temp, h, c = self.enc4(temp)

        #inf_emb = Embedding(output_dim=2, input_dim=5, input_length=None, mask_zero=True, name='embA')
        #inf_enc4 = LSTM(2, return_sequences=True, return_state=True, name='enc4')

        #inf_dec4 = LSTM(2, return_sequences=True, return_state=True, name='dec4')

        #inf_dense = Dense(units=5, activation='softmax',
        # kernel_initializer=TruncatedNormal(mean=0., stddev = 0.05, seed = None),
        #name="dense1")

        #temp = inf_emb(inference_encInput)
        #temp, h, c = inf_enc4(temp)

        enc_model = Model(inputs=inference_encInput, outputs=[temp, h, c])
        #encoder_states is our context for decoding

        # load the weights obtained from a previous training run
        if load:
            enc_model.load_weights(weights_file, by_name=True)
        #print("during inference, enc weights are " + str(enc_model.layers[2].get_weights()))

        #by_name attaches the weights according to the names of the layers. using the instance variables
        #facilitates keeping the names constant.

        inference_decInput = Input(shape=(None, ),
                                   dtype="int32",
                                   name="input_decinf")
        dec_state_input_h = Input(shape=(HID_DIM, ))
        dec_state_input_c = Input(shape=(HID_DIM, ))
        dec_states_inputs = [dec_state_input_h, dec_state_input_c]

        temp = self.emb_A(inference_decInput)
        temp = self.dec1(temp, initial_state=dec_states_inputs)
        #temp = self.dec2(temp)
        temp = self.dec3(temp)

        #temp = inf_emb(inference_decInput)

        inference_decOutput, state_h, state_c = self.dec4(temp)
        #inference_decOutput = self.dec4(temp)
        dec_states = [state_h, state_c]

        inference_decOutput = TimeDistributed(
            self.emb_to_vocab)(inference_decOutput)

        #inference_decOutput = Softmax(name="final_output")(inference_decOutput)

        dec_model = Model(
            inputs=[inference_decInput, dec_state_input_h, dec_state_input_c],
            outputs=[inference_decOutput] + dec_states)
        #dec_model = Model(inputs=[inference_decInput]+dec_states_inputs, outputs=inference_decOutput)

        if load:
            dec_model.load_weights(weights_file, by_name=True)

        print(
            "Begin conversation. Try not to use contractions.\n Punctuate end of sentences. "
            "Capitalization does not matter. Input <quit> to quit.\n")

        while True:  #each iteration of this while loop takes input and predicts a response.
            indices = [dict["<sos>"]]
            eos = False
            count = 0
            text = input("")
            text = text.strip().split()
            if len(text) == 1 and text[0] == "<quit>":
                print("Conversation ended.\n")
                break
            for word in text:
                temp = ""

                if eos:
                    indices.append(dict["<sos>"])
                    eos = False

                #change the word into something the model understands
                word = word.lower()
                if word[-1] in [".", "!", "?"]:
                    eos = True
                    word = word[:-1]
                for char in word:
                    if char.isalpha():
                        temp += char
                word = temp

                if word in dict:
                    indices.append(dict[word])
                    if eos:
                        indices.append(dict["<eos>"])

                else:
                    indices.append(0)
                    if eos:
                        indices.append(dict["<eos>"])

            context_as_strings = [str(idx) for idx in indices]
            context_string = " ".join(context_as_strings)
            print("the context string is: " + context_string)

            #indices now contains our input sequence of integers. Run it through the encoder.
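            # the training inputs were reversed (input_reversed in create_TD above), so reverse the indices here as well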
            indices = list(reversed(indices))
            indices_arr = np.array([indices])
            print("indices arr is " + str(indices_arr))
            #we set up this encoder to output a list containing the hidden and cell state

            enc_output, e_h, e_c = enc_model.predict(indices_arr)
            predicted_states = [e_h, e_c]
            print("encoder states have shape " + str(e_h.shape) + " and " +
                  str(e_c.shape))

            decoder_input = [dict["<sos>"]]
            output_string = []
            output_token = ""
            token_index = 0
            tokens = []

            decoder_array = np.array(decoder_input)
            print("dec_array has shape " + str(decoder_array.shape))

            while (r_dict[token_index] != "<eos>" and count < 15):
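                # keep generating tokens until the model emits <eos> or we reach 15 words (count < 15)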

                input1_batch = [decoder_array]
                #input1_batches = [input1_batch1]
                #we have only 1 batch, and it has only 1 array in it
                input1_batch = np.array(input1_batch)
                print(input1_batch.shape)

                input2_batch = e_h
                #input2_batches = [input2_batch1]
                input2_batch = np.array(input2_batch)
                print(input2_batch.shape)

                input3_batch = e_c
                #input3_batches = [input3_batch1]
                input3_batch = np.array(input3_batch)
                print(input3_batch.shape)

                #print("predicting on "+str(np.array([[decoder_array]+predicted_states])))
                output_tokens, h, c = dec_model.predict(
                    [input1_batch, input2_batch, input3_batch])
                #this should be predicting on [ [array, h, c] ]
                #open the list, each item is a list of the inputs for that prediction
                #output_tokens should be a vector of dimension VOC_SIZE that has been softmaxed, so we need the argmax
                #print(str(output_tokens[0,-1,:]))
                token_index = np.argmax(output_tokens[0, -1, :], axis=0)
                decoder_input.append(token_index)
                #token_index+=3
                print("output word is " + str(token_index) + " ", end='')
                print("decoder input is now " + str(decoder_input))

                #print(r_dict[token_index])
                #print("\n")
                #output token is still an index at this point so we add its word to the predicted words so far

                output_string.append(r_dict[token_index])

                decoder_array = np.array(decoder_input)
                print("decoder_array is " + str(decoder_array))

                #update states for next iteration

                #predicted_states = [h, c]

                #print("new states for decoder are "+str(predicted_states))
                #so instead of feeding the extended sequence to the decoder each time, we just save its state
                #and do one word at a time
                count += 1

            output = " ".join(output_string)
            print("output is: " + output + "\n")
Example #27
import arrow

from helpers import read_data, write_data, get_settings
import api

settings = get_settings()
sync_dates = read_data('sync_dates')
last_sync = arrow.get(sync_dates['attachments'])
article_map = read_data('article_map')
attachment_map = read_data('attachment_map')
attachment_article_map = read_data('attachment_article_map')

for src_article in article_map:
    dst_article = article_map[src_article]
    print('\nGetting attachments in article {}...'.format(src_article))
    url = '{}/{}/articles/{}/attachments.json'.format(settings['src_root'],
                                                      settings['locale'],
                                                      src_article)
    attachments = api.get_resource_list(url,
                                        list_name='article_attachments',
                                        paginate=False)
    if not attachments:
        print('- no attachments found')
        continue
    for src_attachment in attachments:
        if last_sync < arrow.get(src_attachment['created_at']):
            print('- adding new attachment {} to article {}'.format(
                src_attachment['file_name'], dst_article))
            print(src_attachment)
            url = '{}/articles/{}/attachments.json'.format(
                settings['dst_root'], dst_article)