def generate_one_section(writer,
                         nodes=[30, 30, 30],
                         ep_range=[0, 101],
                         step=1,
                         suffix='',
                         bar=True):
    name = 0
    section_name = "{0}-{1} endpoints {2} nodes, step: {3}".format(
        ep_range[0], ep_range[1] - 1, nodes, step)
    endpoints = list(
        combinations(range(ep_range[0], ep_range[1], step), len(nodes)))
    total = len(endpoints)
    for comb in endpoints:
        # exclude cases where there are no endpoints at all
        if comb == (0, ) * len(comb):
            continue
        row_data = dict()
        row_data[field_names[0]] = str(name) + suffix
        for index, ep in enumerate(comb):
            row_data[field_names[index + 1]] = "{node} {endpoint}".format(
                node=nodes[index], endpoint=ep)
        writer.writerow(row_data)
        name += 1
        if bar:
            print_progress_bar(name, total, progress=section_name, length=70)
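
A minimal usage sketch for the function above, assuming it lives in a module that also defines `field_names` and `print_progress_bar`; the column names below are an assumption chosen to match the default `nodes=[30, 30, 30]`:

# Hypothetical driver; field_names is a guess at the CSV layout expected by
# generate_one_section (one name column plus one column per node).
import csv

field_names = ["name", "node 1", "node 2", "node 3"]

with open("sections.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=field_names)
    writer.writeheader()
    generate_one_section(writer, nodes=[30, 30, 30], ep_range=[0, 11], step=5, bar=False)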
Example #2
def extract_features(annotation, image_size=(64, 64)):
    n = len(annotation)
    for i, a in enumerate(annotation):
        print_progress_bar(i, n)
        image_path = a["image"]
        label = a["label"]
        image = cv2.imread(image_path)
        if image is None:
            continue
        image = cv2.resize(image, image_size)
        image_channels = cv2.split(image)
        for channel_idx, channel in enumerate(image_channels):
            np.copyto(net.blobs["data"].data[0, channel_idx, :, :], channel)

        # image = np.dstack(cv2.split(image))
        # np.copyto(net.blobs["data"].data, image)
        # net.blobs["data"].data = image
        output_blobs = net.forward(end="conv1", blobs=["conv1", ])
        channels_num = output_blobs["conv1"].shape[1]
        channels = [output_blobs["conv1"][0, i, :, :] for i in range(channels_num)]
        features = cv2.merge(channels)
        output_dir = join(args.features_dir, "positives" if label else "negatives")
        if not isdir(output_dir):
            mkdir(output_dir)
        feature_map_path = join(output_dir, splitext(basename(image_path))[0] + ".pkl")
        with open(feature_map_path, "wb") as f:
            pkl.dump(features, f)
    stop_progress_bar()
Example #3
def process_images(topic_model, feature_model, filenames, args):
    """
    Process all the given files in the given root path using the
    pre-trained topic-model as well as the feature-model and return
    their transfer-values.
    
    The images are processed in batches to save memory and improve efficiency.
    """

    num_images = len(filenames)
    img_size = K.int_shape(feature_model.input)[
        1:3]  # Expected input size of the pre-trained network

    # Pre-allocate input-batch-array for images
    shape = (args.batch_size, ) + img_size + (3, )
    image_batch = np.zeros(shape=shape, dtype=np.float32)

    # Pre-allocate output-array for transfer-values.
    topic_transfer_values = np.zeros(shape=(num_images, ) +
                                     K.int_shape(topic_model.output)[1:],
                                     dtype=np.float32)
    feature_transfer_values = np.zeros(
        shape=(num_images, K.int_shape(feature_model.output)[1]),
        dtype=np.float32)

    start_index = 0
    print_progress_bar(start_index,
                       num_images)  # Initial call to print 0% progress

    while start_index < num_images:
        end_index = start_index + args.batch_size
        if end_index > num_images:
            end_index = num_images
        current_batch_size = end_index - start_index

        # Load all the images in the batch.
        for i, filename in enumerate(filenames[start_index:end_index]):
            path = os.path.join(args.root, filename)
            img = load_image(path, size=img_size, grayscale=False)
            image_batch[i] = img

        # Use the pre-trained models to process the image
        feature_transfer_values_batch = feature_model.predict(
            image_batch[0:current_batch_size])
        topic_transfer_values_batch = topic_model.predict(
            feature_transfer_values_batch)

        # Save the transfer-values in the pre-allocated arrays
        topic_transfer_values[
            start_index:end_index] = topic_transfer_values_batch[
                0:current_batch_size]
        feature_transfer_values[
            start_index:end_index] = feature_transfer_values_batch[
                0:current_batch_size]

        start_index = end_index
        print_progress_bar(start_index, num_images)  # Update Progress Bar

    print()
    return topic_transfer_values, feature_transfer_values
Example #4
def evaluation_tree(tree,
                    X_test,
                    y_test,
                    print_on=True,
                    deleted_n=[],
                    test_index=[]):
    fx = []
    y = []

    if (print_on):
        utl.print_progress_bar(0, 50)

    if (len(test_index) == 0):
        test_index = [x for x in range(len(X_test))]

    count = 0
    for index in test_index:
        if (print_on):
            utl.print_progress_bar((count / (len(test_index) - 1)) * 100, 50)
            count = count + 1
        fx.append(tree.predict(X_test[index], deleted=deleted_n))
        y.append(y_test[index])

    rmse = err.rmse(y, fx)
    mape = err.mape(y, fx) * 100

    return rmse, mape
def quote_discrepancies(data, feature_names=[]):
    with open('data/external/apostrophe_words.txt', 'r') as f:
        apostrophe_words = list(
            map(lambda x: x.split(',')[0],
                f.read().splitlines()))

    vectors = []

    data_length = len(data)

    for i, entry in enumerate(data):
        entry = entry.lower()

        single_quote_apostrophes = sum(
            map(lambda t: entry.count(t), apostrophe_words))

        count_single = entry.count("\'") - single_quote_apostrophes
        count_double = entry.count("\"")

        vectors.append([float(min(count_single, count_double))])

        print_progress_bar(i + 1,
                           data_length,
                           description='quote_discrepancies')

    feature_names.extend(['quote_discrepancies'])

    return vectors
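
A caveat that applies here and to the other extractors in this section that take `feature_names=[]`: the mutable default list is shared across calls, so repeated calls without an explicit argument keep appending to the same object. A minimal sketch of the safer calling pattern, assuming `print_progress_bar` and `data/external/apostrophe_words.txt` are available as in the original project (the inputs are made up):

# Hypothetical call; pass a fresh list instead of relying on the default argument.
feature_names = []
data = ['She said "it\'s fine".', 'No quotes here.']
vectors = quote_discrepancies(data, feature_names=feature_names)
print(feature_names)   # ['quote_discrepancies']
print(vectors)         # one [count] entry per input text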
Example #6
 def start(self, steps=50, batch_count=(20, 10), mb_start=0):
     start = time.time()
     losses = []
     count = 0
     for i in range(mb_start, steps):
         l, reg, debug = self.step()
         losses.append(l)
         suffix = ("| Current Loss %8.4f | "%l) if len(losses) != batch_count[0] else "| Average Loss %8.4f | " % \
                                                                                      (numpy.mean(losses))
         suffix += "reg %6.3f | time %6.0f ||"%(reg, time.time()-start)
         suffix += debug
         prefix = "Mini Batches %5d or %5.1f epochs"%(i+1, i*self.batch_size/self.train.kb.facts.shape[0])
         utils.print_progress_bar(len(losses), batch_count[0],prefix=prefix, suffix=suffix)
         if len(losses) >= batch_count[0]:
             losses = []
             count += 1
             if count == batch_count[1]:
                 self.scoring_function.eval()
                 valid_score = evaluate.evaluate("valid", self.ranker, self.valid.kb, self.eval_batch,
                                                 verbose=self.verbose, hooks=self.hooks)
                 test_score = evaluate.evaluate("test ", self.ranker, self.test.kb, self.eval_batch,
                                                verbose=self.verbose, hooks=self.hooks)
                 self.scoring_function.train()
                 count = 0
                 print()
                 self.save_state(i, valid_score, test_score)
     print()
     print("Ending")
     print(self.best_mrr_on_valid["valid"])
     print(self.best_mrr_on_valid["test"])
Example #7
def cooc():
    """Computes GloVe cooccurrence matrix given a vocabulary and the pos. and neg. corpora.
    Entries in the cooccurrence matrix are weighted by the inverse of the distance of the two words.
    # Configs
        :dataset_version        - choose preprocessing
        :emb_dataset            - choose full or small dataset
        :emb_context_window     - context window size
        :emb_word_min_count     - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('COOCCURRENCES')
    if reuse_computed and os.path.isfile(vocab_dir+cooc_file+'.pkl'):
        if verbose > 0:
            print('Reusing cooccurrence matrix:', cooc_file)
            print_header_str('DONE')
            print()
        return

    with open(vocab_dir+vocab_file+'.pkl', 'rb') as f:
        vocab = pickle.load(f)

    cooc_dict = dict()
    counter = 0

    tot = (count_file_lines(tweet_dir + emb_train_tweets_pos) +
            count_file_lines(tweet_dir + emb_train_tweets_neg) +
            count_file_lines(tweet_dir + emb_test_tweets))
    
    if verbose == 1:
        print_progress_bar(0, tot, prefix = 'Building cooccurrence matrix:', suffix = 'Complete')

    for fn in [tweet_dir + emb_train_tweets_pos, tweet_dir + emb_train_tweets_neg, tweet_dir + emb_test_tweets]:
        with open(fn) as f:
            for line in f:

                # keeps tokens that are not in vocab for proper window construction
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                
                n = len(tokens)
                for i in range(n):
                    for j in range(max(0,i-emb_context_window),min(n,i+emb_context_window)):
                        if i != j and tokens[i] > 0 and tokens[j] > 0:
                            tok = (tokens[i],tokens[j])
                            cooc_dict[tok] = cooc_dict.get(tok,0)+1/abs(i-j)
                counter += 1
                if verbose == 1 and (counter % 5000 == 0 or counter == tot):
                    print_progress_bar(counter, tot, prefix = 'Building cooccurrence matrix:', suffix = 'Complete')
    
    data = list(cooc_dict.values())
    row = [k1 for k1,k2 in cooc_dict.keys()]
    col = [k2 for k1,k2 in cooc_dict.keys()]

    cooc = coo_matrix((data, (row, col)))

    with open(vocab_dir+cooc_file+'.pkl', 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)
    if verbose > 0:
        print("{} nonzero entries.".format(cooc.nnz))
        print_header_str('DONE')
        print()
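
The matrix written above feeds the rest of the usual GloVe pipeline; a minimal sketch of reading it back, assuming the same `vocab_dir` and `cooc_file` configuration values:

# Hypothetical follow-up step: load the pickled scipy.sparse.coo_matrix.
import pickle

with open(vocab_dir + cooc_file + '.pkl', 'rb') as f:
    cooc = pickle.load(f)

for i, j, n in zip(cooc.row, cooc.col, cooc.data):
    # word i and word j co-occurred with (distance-weighted) count n
    pass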
def apostrophe_discrepancies(data, feature_names=[]):
    with open('data/external/apostrophe_words.txt', 'r') as f:
        apostrophes = list(
            map(lambda x: tuple(x.split(',')),
                f.read().splitlines()))

    vectors = []

    data_length = len(data)

    for i, entry in enumerate(data):
        entry = entry.lower()

        local = list(
            map(lambda x: float(min(entry.count(x[0]), entry.count(x[1]))),
                apostrophes))

        vectors.append(local)

        print_progress_bar(i + 1,
                           data_length,
                           description='apostrophe_discrepancies')

    feature_names.extend([', '.join(a) for a in apostrophes])

    return vectors
Example #9
def load_embeddings(model_manager, labels):
    database = get_database(model_manager)
    coords = database[EMBEDDINGS_COORDINATES_SET_NAME]

    utt_embs = FileArray(model_manager.files['utterance_embeddings'])
    utt_embs.open()

    embeddings = {}

    for progress, (d_idx, turn) in enumerate(labels):

        global_idx, conv_length = coords[d_idx]

        if d_idx not in embeddings:
            embeddings[d_idx] = []

        embeddings[d_idx].append((turn, utt_embs.read(global_idx + turn)))

        if progress % 100 == 0:
            print_progress_bar(progress,
                               len(labels),
                               additional_text='%i embeddings loaded' %
                               progress)

    for k, v in embeddings.items():
        embeddings_turn_sorted = [
            pair[1] for pair in sorted(v, key=lambda p: p[0])
        ]
        embeddings[k] = embeddings_turn_sorted

    utt_embs.close()
    return embeddings
def min_max_lexical_per_sentence(data):
    transformed = []

    data_length = len(data)

    for index, entry in enumerate(data):
        sent_vector = []
        entry_sent = sent_tokenize(entry)

        for sent in entry_sent:
            entry_char = list(sent)
            entry_word = word_tokenize(sent)
            entry_word_tagged = pos_tag(entry_word)

            chars, char_features = lexical_chars(entry_char)
            words, word_features = lexical_words(entry_word_tagged)

            sent_vector.append(chars + words + [
                entry.count('?'),
                entry.count('.'),
                entry.count('!'),
                len(entry)
            ])

        min_v = np.amin(sent_vector, axis=0).tolist()
        max_v = np.amax(sent_vector, axis=0).tolist()
        transformed.append(np.subtract(max_v, min_v).tolist())

        print_progress_bar(index + 1,
                           data_length,
                           description='min_max_lexical_per_sentence')

    return transformed
def lexical(X, feature_names=[]):
    transformed = []

    for i, doc in enumerate(X):
        segments = []

        for entry in doc:
            entry_char = list(entry)
            entry_word = word_tokenize(entry)
            entry_word_tagged = pos_tag(entry_word)
            entry_sent = sent_tokenize(entry)

            chars, char_features = lexical_chars(entry_char)
            words, word_features = lexical_words(entry_word_tagged)
            sentences, sentence_features = lexical_sentences(entry_sent)
            consecutive_dots = [
                entry.count('..') + entry.count('...') + entry.count('....')
            ]

            segments.append(chars + words + sentences + consecutive_dots)

        transformed.append(segments)

        print_progress_bar(i + 1, len(X), description='lexical')

    feature_names.extend(char_features + word_features + sentence_features +
                         ['consecutive_dots'])

    return np.array(transformed)
Example #12
def phrase_frequency(data,
                     word_gram_sizes,
                     stop_words,
                     use_mean,
                     feature_names=[]):
    vectors = []

    data_length = len(data)

    for i, entry in enumerate(data):
        words = word_tokenize(entry)

        if (stop_words):
            words = remove_stop_words(words)

        local = []
        for word_gram_size in word_gram_sizes:
            local.append(
                get_ordered_words_occurances(words, entry, word_gram_size,
                                             use_mean))

        vectors.append(local)

        print_progress_bar(i + 1, data_length, description='phrase_frequency')

    feature_names.extend([str(size) + 'gram' for size in word_gram_sizes])

    return vectors
Example #13
def encode_categories(image_ids, image_categories, category_id, params):
    """ Replace all category names with their respective IDs and
        store them in a numpy array as a multi-hot vector.
    """

    categories = []

    # Initial call to print 0% progress
    print_progress_bar_counter = 0
    print_progress_bar(print_progress_bar_counter,
                       params['dataset_size'],
                       prefix='Progress:',
                       suffix='Complete',
                       length=50)

    for image_id in image_ids:
        one_hot = [0] * len(category_id)
        if params['single_label']:
            one_hot[category_id[random.choice(image_categories[image_id])]] = 1
        else:
            for category in image_categories[image_id]:
                one_hot[category_id[category]] = 1
        categories.append(one_hot)

        # Update Progress Bar
        print_progress_bar_counter += 1
        print_progress_bar(print_progress_bar_counter,
                           params['dataset_size'],
                           prefix='Progress:',
                           suffix='Complete',
                           length=50)

    return np.array(categories, dtype=np.float32)
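
A toy invocation sketch, assuming `numpy` and `print_progress_bar` are importable as in the original module; the IDs and categories below are made up:

# Hypothetical inputs illustrating the multi-hot output.
image_ids = [101, 102]
image_categories = {101: ["cat"], 102: ["cat", "dog"]}
category_id = {"cat": 0, "dog": 1}
params = {"dataset_size": 2, "single_label": False}

encoded = encode_categories(image_ids, image_categories, category_id, params)
# encoded is a float32 numpy array, roughly [[1., 0.], [1., 1.]].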
Example #14
def evaluate(name, ranker, kb, batch_size, verbose=0, top_count=5, hooks=None):
    """
    Evaluates an entity ranker on a knowledge base by computing mean reciprocal rank, mean rank, hits@10, etc.\n
    Can also print type prediction score with higher verbosity.\n
    :param name: A name that is displayed with this evaluation on the terminal
    :param ranker: The ranker that is used to rank the entities
    :param kb: The knowledge base to evaluate on. Must be augmented with type information when used with higher verbosity
    :param batch_size: The batch size of each minibatch
    :param verbose: The verbosity level. More info is displayed with higher verbosity
    :param top_count: The number of entities whose details are stored
    :param hooks: The additional hooks that need to be run with each mini-batch
    :return: A dict with the mrr, mr, hits10 and hits1 of the ranker on kb
    """
    if hooks is None:
        hooks = []
    totals = { "m":{"mrr":0, "mr":0, "hits10":0, "hits1":0}}
    start_time = time.time()
    if name == "train":
        facts = kb.facts[:50000]
    else:
        facts = kb.facts
    if(verbose>0):
        totals["correct_type"]={"e1":0, "e2":0}
        entity_type_matrix = kb.entity_type_matrix.cuda()
        for hook in hooks:
            hook.begin()
    for i in range(0, int(facts.shape[0]), batch_size):
        start = i
        end = min(i+batch_size, facts.shape[0])
        s = facts[start:end, 0]
        r = facts[start:end, 1]
        o = facts[start:end, 2]
        knowns_o = ranker.get_knowns(s, r)
        s = torch.autograd.Variable(torch.from_numpy(s).cuda(), requires_grad=False)
        r = torch.autograd.Variable(torch.from_numpy(r).cuda(), requires_grad=False)
        o = torch.autograd.Variable(torch.from_numpy(o).cuda(), requires_grad=False)
        knowns_o = torch.from_numpy(knowns_o).cuda()
        
        ranks_o, scores_o, score_of_expected_o = ranker.forward(s, r, o, knowns_o)
        #print(ranks_o)
        #e1,r,?
        totals['m']['mr'] += ranks_o.sum()
        totals['m']['mrr'] += (1.0/ranks_o).sum()
        totals['m']['hits10'] += ranks_o.le(11).float().sum()
        totals['m']['hits1'] += ranks_o.eq(1).float().sum()

        utils.print_progress_bar(end, facts.shape[0], "Eval on %s" % name,
                                 "|M| mrr:%3.2f|h10:%3.2f%%|h1:%3.2f|time %5.0f|" %
                                 (100.0*totals['m']['mrr']/end, 100.0*totals['m']['hits10']/end,
                                  100.0*totals['m']['hits1']/end, time.time()-start_time),
                                 color="green")
    
    gc.collect()
    torch.cuda.empty_cache()
    for hook in hooks:
        hook.end()
    print(" ")
            
    totals['m'] = {x:totals['m'][x]/facts.shape[0] for x in totals['m']}

    return totals
Example #15
def anagram(anag: str):
    """
    For all elements in all dictionaries, find words that contain any anagram of `anag` as a substring.

    For "non-consecutive anagrams" you just want a word bank: see wordbank.py.
    """
    found = []

    try:
        all_elems = utils.get_all_dicts()

        perms = perm_strs(anag)
        num_perms = len(perms)
        utils.print_progress_bar(0, num_perms)
        for i, perm in enumerate(perms):
            found.extend([elem for elem in all_elems if perm in elem])
            utils.print_progress_bar(i+1, num_perms)

    finally:
        print('found {} elems after containing an anagram'.format(len(found)))
        if found:
            utils.list_to_file(fname_anagram(anag), found)
            if len(found) < 100:
                for elem in found:
                    print('\t-', elem)
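
`perm_strs` is not part of this snippet; a plausible sketch of it, assuming it simply enumerates the distinct character permutations of the input string:

from itertools import permutations

def perm_strs(s: str):
    # Every distinct ordering of the characters of s, joined back into strings.
    return [''.join(p) for p in set(permutations(s))]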
Example #16
def encode_images(image_ids, image_file, params):
    """ Store images in a numpy array """

    images = []

    # Initial call to print 0% progress
    print_progress_bar_counter = 0
    print_progress_bar(print_progress_bar_counter,
                       params['dataset_size'],
                       prefix='Progress:',
                       suffix='Complete',
                       length=50)

    for image_id in image_ids:
        img_array = load_image(os.path.join(params['input_images'],
                                            image_file[image_id]),
                               size=(params['image_size'],
                                     params['image_size']),
                               grayscale=params['grayscale'])
        images.append(img_array)

        # Update Progress Bar
        print_progress_bar_counter += 1
        print_progress_bar(print_progress_bar_counter,
                           params['dataset_size'],
                           prefix='Progress:',
                           suffix='Complete',
                           length=50)

    return np.array(images, dtype=np.float32)
def split_points_count(data, words_left, words_right, window_words):
    vectors = []

    data_length = len(data)

    for i, entry in enumerate(data):
        entry = entry.lower()

        words = word_tokenize(entry)

        local = []

        index = 0
        while(index <= len(words) - window_words):
            summ = 0
            for word in words[index:index + window_words]:
                l = words_left.get(word, 0)
                r = words_right.get(word, 0)

                summ += max(l, r)
            index += window_words
            local.append(summ)

        vectors.append([max(local)])
        print_progress_bar(i + 1, data_length, description = 'split_points')

    return vectors
def processed_tags(X, feature_names=[]):
    transformed = []

    for i, doc in enumerate(X):
        segments = []

        for entry in doc:
            words = word_tokenize(entry)
            word_count = len(words)
            word_analysis = dict.fromkeys(preprocessor.tags, 0)

            for word in words:
                for tag in preprocessor.tags:
                    if word == tag:
                        word_analysis[tag] += 1

            segments.append([word_analysis[key]/word_count for key in preprocessor.tags])

        transformed.append(segments)

        print_progress_bar(i + 1, len(X), description = 'processed tags')

    feature_names.extend(preprocessor.tags)

    return np.array(transformed)
Example #19
def extract_features(annotation, image_size=(64, 64)):
    n = len(annotation)
    for i, a in enumerate(annotation):
        print_progress_bar(i, n)
        image_path = a["image"]
        label = a["label"]
        image = cv2.imread(image_path)
        if image is None:
            continue
        image = cv2.resize(image, image_size)
        image_channels = cv2.split(image)
        for channel_idx, channel in enumerate(image_channels):
            np.copyto(net.blobs["data"].data[0, channel_idx, :, :], channel)

        # image = np.dstack(cv2.split(image))
        # np.copyto(net.blobs["data"].data, image)
        # net.blobs["data"].data = image
        output_blobs = net.forward(end="conv1", blobs=[
            "conv1",
        ])
        channels_num = output_blobs["conv1"].shape[1]
        channels = [
            output_blobs["conv1"][0, i, :, :] for i in range(channels_num)
        ]
        features = cv2.merge(channels)
        output_dir = join(args.features_dir,
                          "positives" if label else "negatives")
        if not isdir(output_dir):
            mkdir(output_dir)
        feature_map_path = join(output_dir,
                                splitext(basename(image_path))[0] + ".pkl")
        with open(feature_map_path, "wb") as f:
            pkl.dump(features, f)
    stop_progress_bar()
Example #20
def wikisort_file(file: str):
    _, names = utils.file_to_list(file)
    scores = {}
    couldnt_find = []

    utils.print_progress_bar(0, len(names))
    for i, name in enumerate(names):
        try:
            scores[name] = views_per_month(name)
        except Exception:
            # should probably keep track of the exceptions (so can tell if it's rate limiting etc.)
            couldnt_find.append(name)
        finally:
            utils.print_progress_bar(i + 1, len(names))

    print()
    print('---FAILED TO FIND---')
    print(couldnt_find)
    print('------')
    print()

    sort_by_views = [
        '{}\t{}'.format(k, v)
        for k, v in sorted(scores.items(), key=lambda x: x[1], reverse=True)
    ]

    utils.list_to_file(fname_ranked(file), sort_by_views, do_dedupe=False)
    def average_word_frequency(self, X, feature_names=[]):
        transformed = []

        for i, doc in enumerate(X):
            segments = []

            for entry in doc:
                class_sum = 0
                word_count = 0
                uncommon = 0
                entry = entry.lower()
                for w in word_tokenize(entry):
                    w = re.sub('[^a-zA-Z]+', '', w)
                    if not w: continue

                    word_count+=1
                    word_class = self.word_class.get(w, 20)
                    if word_class == 20:
                        uncommon += 1
                    class_sum += word_class

                segments.append([class_sum/word_count, uncommon/word_count])

            transformed.append(segments)
            print_progress_bar(i + 1, len(X), description = 'word frequency')

        feature_names.extend(['average_word_class', 'uncommon_words'])

        return transformed
Example #22
def time_perp(main_table_df):
    out.info("Performing Watwin pre-processing...")
    # Watson(2013) doesn't state how they obtain the mean and sd; we assume both are calculated from all compilation
    # pairs
    # Initialization:
    time_arr = {}
    mean_dict = {}
    std_dict = {}

    subjects = set(main_table_df["SubjectID"])
    timer_index = 1
    for subj in subjects:
        utils.print_progress_bar(timer_index, len(subjects))
        timer_index += 1

        current_df = main_table_df.loc[main_table_df["SubjectID"] == subj]
        current_df = current_df.sort_values(by=['Order'])
        compiles = current_df[current_df["EventType"] == "Compile"]
        compile_errors = current_df[current_df["EventType"] == "Compile.Error"]

        sum_time = 0
        count_time = 0

        if len(compiles) > 1:
            time_arr[subj] = {}
            for i in range(len(compiles) - 1):
                # Watson(2013) requires pair pruning, in which Remove identical pairs
                if compiles["CodeStateID"].iloc[i + 1] != compiles["CodeStateID"].iloc[i]:
                    e1_errors = compile_errors[compile_errors["ParentEventID"] == compiles["EventID"].iloc[i]]
                    e2_errors = compile_errors[compile_errors["ParentEventID"] == compiles["EventID"].iloc[i + 1]]
                    # If e1 compile resulted in error
                    if len(e1_errors) > 0:
                        # Watson(2013) requires preparing a time estimate before calculating the score; we assume no
                        # invocation is reported in the dataset, which means using the time difference of compilation
                        # pairs directly
                        datetimeFormat = '%Y-%m-%dT%H:%M:%S'
                        date1 = datetime.datetime.strptime(compiles["ServerTimestamp"].iloc[i + 1], datetimeFormat)
                        date2 = datetime.datetime.strptime(compiles["ServerTimestamp"].iloc[i], datetimeFormat)
                        time_diff = ((((date1.month - date2.month) * 30 + (date1.day - date2.day)) * 24 + (
                                    date1.hour - date2.hour)) * 60 + (date1.minute - date2.minute)) * 60 + (
                                                date1.second - date2.second)
                        sum_time += time_diff
                        count_time = count_time + 1
                        time_arr[subj][compiles["CodeStateID"].iloc[i]] = time_diff

        if count_time != 0:
            mean_time = sum_time / count_time
            mean_dict[subj] = mean_time
            std_time = np.std(np.asarray(list(time_arr[subj].values())))
            std_dict[subj] = std_time
        else:
            mean_time = 0
            mean_dict[subj] = mean_time
            std_time = 0
            std_dict[subj] = std_time

    out.info("Finished Watwin pre-processing...")
    return time_arr, mean_dict, std_dict
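
The hand-rolled `time_diff` above approximates a month as 30 days and ignores the year; if the timestamps really follow `'%Y-%m-%dT%H:%M:%S'`, the same quantity can be computed exactly with datetime subtraction. A sketch of the equivalent calculation (the timestamps are made up):

import datetime

fmt = '%Y-%m-%dT%H:%M:%S'
date1 = datetime.datetime.strptime('2020-03-01T10:05:30', fmt)
date2 = datetime.datetime.strptime('2020-02-28T09:00:00', fmt)
time_diff = (date1 - date2).total_seconds()  # exact difference in seconds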
    def parse_records(self):
        """
        
        """
        # This is where the real parsing happens.
        trajectory = np.zeros([self.total_records, self.n_res], dtype=int)
        state_counts = np.zeros([self.total_records], dtype=int)
        energies = np.zeros([self.total_records], dtype=float)
        #print trajectory
        progress_counter = 0 # Ties into the progress bar that Kamran created in utils.py
        print_progress_bar(progress_counter, self.total_records) # The function called outside the 
        # for loop to initiate with an empty bar. 
        with open(self.ms_data_file, "rb") as ms:
            for index, record in enumerate(self.byte_indices): # enumerate returns an iterable that
                # provides coders with a counter along with the values that they wish to iterate
                # across. Hence enumerate(self.byte_indices) returns for each entry in the list, 
                # its index as one variable and its value as another. 
                ms.seek(record) # seek() moves the file cursor to a byte offset. Here we go to
                # the location specified by the current record value (which is in bytes, and is
                # the beginning of a microstate). 
                bytes_conf_ids = ms.read(2 * self.n_res) # Starting from the record index, the 
                # conformer id is exactly 2 * self.n_res bytes long. This is largely due to the fact
                # that each microstate contains a conformer of each residue. 
                bytes_energies_1 = ms.read(8) # Read in the energy corresponding to that particular 
                # microstate. 
                ms.seek(ms.tell() + 8) # Skip ahead 8 bytes, because we do not care about the 
                # information located at that position.
                energy = struct.unpack("d", bytes_energies_1)[0] # This is where we will convert the
                # binary data to a double datatype. Note, a double is a float with much higher 
                # precision and range. Energy binary is converted to a decimal. 
                bytes_state_count = ms.read(4) # The remaining 4 bytes appear to hold the state
                # count: a positive number telling us how many times this microstate has occurred.
                trajectory[index, :] = np.asarray(struct.unpack(str(self.n_res) + "H", bytes_conf_ids))
                # The bytes containing conformer ids are unpacked into small unsigned short datatype
                # segments. These are then stored in the row denoted by index, which is defined by the
                # for-loop. For this to work it is required that the product of np.asarray(...) has
                # the same number of columns as the trajectory, which is the number of residues. Hence
                # my hypothesis at this time is that the RHS contains conformer ids for each residue.

                #print(struct.unpack(str(self.n_res) + "H", bytes_conf_ids)[-2:])

                state_count = struct.unpack("i", bytes_state_count)[0] # Converts the binary data
                # of the state count to an integer.
                self.total_microstates += state_count # Class property total_microstate is increased
                # by the state count of the particular microstate the for-loop is currently on. 
                state_counts[index] += state_count # The value for microstate occurence in the current
                # record is recorded in the index we are on, for the list state_counts. 
                energies[index] += energy # Similarly, the energy corresponding to that microstate
                # is also recorded.
                progress_counter += 1 # Update that a step has been completed. 
                print_progress_bar(progress_counter, self.total_records) # Update the progress bar
                # with a higher percentage complete value. 
        self.trajectory = trajectory # Make trajectory a class property.
        self.state_counts = state_counts # Make state_counts a class property.
        self.energies = energies # Make energies a class property.
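
For reference, the reads above imply a fixed per-record size; a small sketch of that layout (the residue count is a made-up example):

import struct

n_res = 100                                    # hypothetical residue count
record_size = struct.calcsize('%dH' % n_res)   # 2 * n_res bytes of conformer ids
record_size += struct.calcsize('d')            # 8-byte energy (double)
record_size += 8                               # 8 bytes that are skipped
record_size += struct.calcsize('i')            # 4-byte state count
print(record_size)                             # 2 * n_res + 20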
Example #24
def global_ngrams(X, vect, feature_names=[]):
    transformed = []

    for i, doc in enumerate(X):
        transformed.append(vect.transform(doc).toarray())
        print_progress_bar(i + 1, len(X), description = 'ngrams')

    feature_names.extend(vect.get_feature_names())

    return transformed
Example #25
def calculate_weights(data, train_positions, inverse_scaling, half_sigmoid_sharpness, size):
    word_weights = {}
    word_counts = {}

    data_length = len(data)

    for i, (entry, positions) in enumerate(zip(data, train_positions)):
        entry = entry.lower()

        fragments = []
        positions.append(len(entry))

        entry_marker = 0

        for change in positions:
            fragments.append(entry[entry_marker:change])
            entry_marker = change

        for fragment in fragments:
            fragment = re.sub("[^a-zA-Z]+", " ", fragment)
            words = word_tokenize(fragment)

            fragment_length = len(words)

            for position, word in enumerate(words):
                if(size and position >= size and position <= fragment_length - size - 1):
                    continue

                word_weight = weight_half_sigmoid(position, fragment_length, half_sigmoid_sharpness)
                if word in word_weights:
                    word_weights[word].append(word_weight)
                    word_counts[word] = word_counts[word] + 1
                else:
                    word_weights[word] = [word_weight]
                    word_counts[word] = 1

        print_progress_bar(i + 1, data_length, description = 'split_points_weights')

    remove_entries(word_weights, stopwords.words('english'))
    remove_entries(word_counts, stopwords.words('english'))

    max_word_count = word_counts[max(word_counts.items(), key=itemgetter(1))[0]]
    min_word_count = word_counts[min(word_counts.items(), key=itemgetter(1))[0]]

    if(inverse_scaling):
        additional_weight = lambda k: float((word_counts[k] + 1 - min_word_count) / (max_word_count + 1 - min_word_count))

        word_weights = {k: (sum(v) / float(len(v))) * additional_weight(k) for k, v in word_weights.items()}
    else:
        word_weights = {k: sum(v) / float(len(v)) for k, v in word_weights.items()}

    for key, value in sorted(word_weights.items(), key = itemgetter(1), reverse = True)[:50]: print(key, value)

    return word_weights
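
`weight_half_sigmoid` is referenced but not defined in this snippet. A purely illustrative sketch, assuming the intent is to give words near a fragment boundary more weight than words in the middle, with `sharpness` controlling how quickly the weight falls off:

import math

def weight_half_sigmoid(position, fragment_length, sharpness):
    # Hypothetical stand-in for the helper used above (not part of this snippet):
    # weight is near 1 for words close to either fragment boundary and falls off
    # towards the middle of the fragment.
    if fragment_length <= 1:
        return 1.0
    half = (fragment_length - 1) / 2.0
    edge_frac = min(position, fragment_length - 1 - position) / half  # 0 at an edge, 1 in the middle
    return 1.0 - 1.0 / (1.0 + math.exp(-sharpness * (edge_frac - 0.5)))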
Example #26
    def get_word_tfidf(self, X, feature_names=[]):
        transformed = []
        vect = self.word_vect

        for i, doc in enumerate(X):
            transformed.append(vect.transform(doc).toarray())

            print_progress_bar(i + 1, len(X), description = 'tfidf')

        feature_names.extend(vect.get_feature_names())

        return transformed
Example #27
def evaluate_generative(model_manager):
    rand_iter = random_response_generator(model_manager)
    encoder = model_manager.load_currently_selected_model()

    answer_model = get_response_generator(encoder)

    #translator = data_access.get_label_translator(model_manager)

    #evaluator = get_response_evaluator(model_manager.load_currently_selected_model())
    rankings = []
    start_time = time()

    result_arr = FileArray('./results/generative_results_%s.bin'%model_manager.model_name, shape=(1000000, 1), dtype='i4')
    result_arr.open()
    progress = 0

    print('evaluating generative approach for', model_manager.model_name)
    for instance in evaluation_sample_iterator(model_manager):
        progress += 1

        prev_result = result_arr.read(progress)

        if prev_result >= 1:
            progress += 1
            rankings.append(prev_result[0]-1)
            continue

        random_responses = [next(rand_iter) for x in range(9)]

        context = instance['context']

        cost, answer, pred_utt_emb = answer_model(context)

        candidates = [(cosine(pred_utt_emb, instance['answer_utterance_emb']), True)]
        for random_resp, rand_utt_emb in random_responses:
            cost = cosine(pred_utt_emb, rand_utt_emb)
            candidates.append((cost, False))

        candidates = sorted(candidates, key=lambda pair: pair[0])

        rank = [idx for idx, cand in enumerate(candidates) if candidates[idx][1]][0]
        rankings.append(rank)

        result_arr.write(progress-1, np.array([rank+1], dtype='i4'))


        rATk = calculate_recall_at_k(rankings, 10)
        result_str = ' | '.join(['R@%i %.3f%%' % (k + 1, percentage * 100) for k, percentage in rATk.items()])

        print_progress_bar(instance['progress'], instance['conversations'], additional_text=result_str,
                           start_time=start_time)

    result_arr.close()
Example #28
    def get_function_words(self, X, feature_names=[]):
        transformed = []
        vect = self.function_word_vect

        for i, doc in enumerate(X):
            segments = [self.only_function_words(s) for s in doc]
            transformed.append(vect.transform(segments).toarray())

            print_progress_bar(i + 1, len(X), description = 'function words')

        feature_names.extend(vect.get_feature_names())

        return transformed
Example #29
    def compute_charcnn_embed(self, batch_size=200):
        ent_cnt = len(self.train.kb.datamap.entity_map)

        print("Precomputing charCNN embeddings")
        with torch.no_grad():
            for i in range(0, ent_cnt, batch_size):
                utils.print_progress_bar(i, ent_cnt)
                inp = self.train.kb.charcnn_packaged(
                    [numpy.arange(i, min(i + batch_size, ent_cnt))])
                self.scoring_function.compute_char_embeddings(
                    i, i + batch_size, inp[0])

        print("charCNN embeddings computed")
def calculate_weights_count(data, train_positions, inverse_scaling, half_sigmoid_sharpness, size):
    words_left = {}
    words_right = {}
    words_global = {}

    data_length = len(data)

    for i, (entry, positions) in enumerate(zip(data, train_positions)):
        entry = entry.lower()

        fragments = []
        positions.append(len(entry))

        entry_marker = 0

        for change in positions:
            fragments.append(entry[entry_marker:change])
            entry_marker = change

        for fragment in fragments:
            fragment = re.sub("[^a-zA-Z]+", " ", fragment)
            words = word_tokenize(fragment)
            left = words[:size]
            right = words[-size:]

            for word in words:
                words_global[word] = words_global.get(word, 0) + 1

            for word in left:
                words_left[word] = words_left.get(word, 0) + 1

            for word in right:
                words_right[word] = words_right.get(word, 0) + 1

        print_progress_bar(i + 1, data_length, description = 'split_points_weights')

    remove_entries(words_left, stopwords.words('english'))
    remove_entries(words_right, stopwords.words('english'))

    words_left = min_max_dict(words_left)
    words_right = min_max_dict(words_right)

    for key, value in sorted(words_left.items(), key = itemgetter(1), reverse = True)[:50]: print(key, value)
    print('====================================================')
    for key, value in sorted(words_right.items(), key = itemgetter(1), reverse = True)[:50]: print(key, value)

    return words_left, words_right
def num_paragraphs(data, feature_names=[]):
    vectors = []

    data_length = len(data)

    for i, entry in enumerate(data):
        vectors.append([float(entry.count('\n') + 1)])

        print_progress_bar(i + 1, data_length, description = 'num_paragraphs')

    feature_names.extend(['num_paragraphs'])

    return vectors
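
A quick usage sketch for the extractor above, assuming `print_progress_bar` is available from the same module (the documents are made up):

# Hypothetical call with two toy documents.
names = []
vectors = num_paragraphs(["first paragraph\nsecond paragraph", "single paragraph"],
                         feature_names=names)
# vectors == [[2.0], [1.0]] and names == ['num_paragraphs']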