def generate_one_section(writer, nodes=[30, 30, 30], ep_range=[0, 101], step=1, suffix='', bar=True):
    name = 0
    section_name = "{0}-{1} endpoints {2} nodes, step: {3}".format(
        ep_range[0], ep_range[1] - 1, nodes, step)
    endpoints = list(
        combinations(range(ep_range[0], ep_range[1], step), len(nodes)))
    total = len(endpoints)
    for comb in endpoints:
        # exclude cases where there are no endpoints at all
        if comb == (0, ) * len(comb):
            continue
        row_data = dict()
        row_data[field_names[0]] = str(name) + suffix
        for index, ep in enumerate(comb):
            row_data[field_names[index + 1]] = "{node} {endpoint}".format(
                node=nodes[index], endpoint=ep)
        writer.writerow(row_data)
        name += 1
        if bar:
            print_progress_bar(name, total, progress=section_name, length=70)
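# All of the snippets in this file rely on a project-specific
# `print_progress_bar` utility that is not shown here, and whose keyword
# arguments differ from project to project (prefix/suffix, description,
# additional_text, color, ...). The sketch below is a hypothetical minimal
# implementation matching the positional (iteration, total) call style used
# above; it is an assumption, not any project's actual helper.
import sys


def print_progress_bar(iteration, total, progress='', length=40, **kwargs):
    # Render a simple in-place terminal progress bar; extra keyword
    # arguments used by other snippets are accepted and ignored here.
    filled = int(length * iteration / max(total, 1))
    bar = '#' * filled + '-' * (length - filled)
    sys.stdout.write('\r%s |%s| %d/%d' % (progress, bar, iteration, total))
    sys.stdout.flush()
    if iteration >= total:
        sys.stdout.write('\n')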
def extract_features(annotation, image_size=(64, 64)):
    n = len(annotation)
    for i, a in enumerate(annotation):
        print_progress_bar(i, n)
        image_path = a["image"]
        label = a["label"]
        image = cv2.imread(image_path)
        if image is None:
            continue
        image = cv2.resize(image, image_size, image)
        image_channels = cv2.split(image)
        for channel_idx, channel in enumerate(image_channels):
            np.copyto(net.blobs["data"].data[0, channel_idx, :, :], channel)
        # image = np.dstack(cv2.split(image))
        # np.copyto(net.blobs["data"].data, image)
        # net.blobs["data"].data = image
        output_blobs = net.forward(end="conv1", blobs=["conv1", ])
        channels_num = output_blobs["conv1"].shape[1]
        channels = [output_blobs["conv1"][0, c, :, :]
                    for c in range(channels_num)]
        features = cv2.merge(channels)
        output_dir = join(args.features_dir,
                          "positives" if label else "negatives")
        if not isdir(output_dir):
            mkdir(output_dir)
        feature_map_path = join(output_dir,
                                splitext(basename(image_path))[0] + ".pkl")
        # pickle in binary mode (the original Python 2 `file(..., "w")` call
        # is not available in Python 3)
        with open(feature_map_path, "wb") as f:
            pkl.dump(features, f)
    stop_progress_bar()
def process_images(topic_model, feature_model, filenames, args):
    """
    Process all the given files in the given root path using the pre-trained
    topic-model as well as the feature-model and return their transfer-values.

    The images are processed in batches to save memory and improve efficiency.
    """
    num_images = len(filenames)

    # Expected input size of the pre-trained network
    img_size = K.int_shape(feature_model.input)[1:3]

    # Pre-allocate input-batch-array for images
    shape = (args.batch_size, ) + img_size + (3, )
    image_batch = np.zeros(shape=shape, dtype=np.float32)

    # Pre-allocate output-arrays for transfer-values.
    topic_transfer_values = np.zeros(
        shape=(num_images, ) + K.int_shape(topic_model.output)[1:],
        dtype=np.float32)
    feature_transfer_values = np.zeros(
        shape=(num_images, K.int_shape(feature_model.output)[1]),
        dtype=np.float32)

    start_index = 0

    # Initial call to print 0% progress
    print_progress_bar(start_index, num_images)

    while start_index < num_images:
        end_index = start_index + args.batch_size
        if end_index > num_images:
            end_index = num_images
        current_batch_size = end_index - start_index

        # Load all the images in the batch.
        for i, filename in enumerate(filenames[start_index:end_index]):
            path = os.path.join(args.root, filename)
            img = load_image(path, size=img_size, grayscale=False)
            image_batch[i] = img

        # Use the pre-trained models to process the images
        feature_transfer_values_batch = feature_model.predict(
            image_batch[0:current_batch_size])
        topic_transfer_values_batch = topic_model.predict(
            feature_transfer_values_batch)

        # Save the transfer-values in the pre-allocated arrays
        topic_transfer_values[start_index:end_index] = \
            topic_transfer_values_batch[0:current_batch_size]
        feature_transfer_values[start_index:end_index] = \
            feature_transfer_values_batch[0:current_batch_size]

        start_index = end_index

        # Update Progress Bar
        print_progress_bar(start_index, num_images)

    print()
    return topic_transfer_values, feature_transfer_values
def evaluation_tree(tree, X_test, y_test, print_on=True, deleted_n=[], test_index=[]):
    fx = []
    y = []
    if print_on:
        utl.print_progress_bar(0, 50)
    if len(test_index) == 0:
        test_index = [x for x in range(len(X_test))]
    count = 0
    for index in test_index:
        if print_on:
            utl.print_progress_bar((count / (len(test_index) - 1)) * 100, 50)
        count = count + 1
        fx.append(tree.predict(X_test[index], deleted=deleted_n))
        y.append(y_test[index])
    rmse = err.rmse(y, fx)
    mape = err.mape(y, fx) * 100
    return rmse, mape
def quote_discrepancies(data, feature_names=[]):
    with open('data/external/apostrophe_words.txt', 'r') as f:
        apostrophe_words = list(
            map(lambda x: x.split(',')[0], f.read().splitlines()))
    vectors = []
    data_length = len(data)
    for i, entry in enumerate(data):
        entry = entry.lower()
        single_quote_apostrophes = sum(
            map(lambda t: entry.count(t), apostrophe_words))
        count_single = entry.count("\'") - single_quote_apostrophes
        count_double = entry.count("\"")
        vectors.append([float(min(count_single, count_double))])
        print_progress_bar(i + 1, data_length,
                           description='quote_discrepancies')
    feature_names.extend(['quote_discrepancies'])
    return vectors
def start(self, steps=50, batch_count=(20, 10), mb_start=0):
    start = time.time()
    losses = []
    count = 0
    for i in range(mb_start, steps):
        l, reg, debug = self.step()
        losses.append(l)
        suffix = ("| Current Loss %8.4f | " % l) if len(losses) != batch_count[0] \
            else "| Average Loss %8.4f | " % (numpy.mean(losses))
        suffix += "reg %6.3f | time %6.0f ||" % (reg, time.time() - start)
        suffix += debug
        prefix = "Mini Batches %5d or %5.1f epochs" % (
            i + 1, i * self.batch_size / self.train.kb.facts.shape[0])
        utils.print_progress_bar(len(losses), batch_count[0],
                                 prefix=prefix, suffix=suffix)
        if len(losses) >= batch_count[0]:
            losses = []
            count += 1
            if count == batch_count[1]:
                self.scoring_function.eval()
                valid_score = evaluate.evaluate("valid", self.ranker,
                                                self.valid.kb, self.eval_batch,
                                                verbose=self.verbose,
                                                hooks=self.hooks)
                test_score = evaluate.evaluate("test ", self.ranker,
                                               self.test.kb, self.eval_batch,
                                               verbose=self.verbose,
                                               hooks=self.hooks)
                self.scoring_function.train()
                count = 0
                print()
                self.save_state(i, valid_score, test_score)
    print()
    print("Ending")
    print(self.best_mrr_on_valid["valid"])
    print(self.best_mrr_on_valid["test"])
def cooc():
    """Computes the GloVe cooccurrence matrix given a vocabulary and the pos.
    and neg. corpora. Entries in the cooccurrence matrix are weighted by the
    inverse of the distance of the two words.

    # Configs
    :dataset_version - choose preprocessing
    :emb_dataset - choose full or small dataset
    :emb_context_window - context window size
    :emb_word_min_count - minimum word count for a word to appear in vocab
    """
    if verbose > 0:
        print_header_str('COOCCURRENCES')

    if reuse_computed and os.path.isfile(vocab_dir + cooc_file + '.pkl'):
        if verbose > 0:
            print('Reusing cooccurrence matrix:', cooc_file)
            print_header_str('DONE')
            print()
        return

    with open(vocab_dir + vocab_file + '.pkl', 'rb') as f:
        vocab = pickle.load(f)

    cooc_dict = dict()
    counter = 0
    tot = (count_file_lines(tweet_dir + emb_train_tweets_pos)
           + count_file_lines(tweet_dir + emb_train_tweets_neg)
           + count_file_lines(tweet_dir + emb_test_tweets))
    if verbose == 1:
        print_progress_bar(0, tot, prefix='Building cooccurrence matrix:',
                           suffix='Complete')

    for fn in [tweet_dir + emb_train_tweets_pos,
               tweet_dir + emb_train_tweets_neg,
               tweet_dir + emb_test_tweets]:
        with open(fn) as f:
            for line in f:
                # keeps tokens that are not in vocab for proper window construction
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                n = len(tokens)
                for i in range(n):
                    for j in range(max(0, i - emb_context_window),
                                   min(n, i + emb_context_window)):
                        if i != j and tokens[i] > 0 and tokens[j] > 0:
                            tok = (tokens[i], tokens[j])
                            cooc_dict[tok] = cooc_dict.get(tok, 0) + 1 / abs(i - j)

                counter += 1
                if verbose == 1 and (counter % 5000 == 0 or counter == tot):
                    print_progress_bar(counter, tot,
                                       prefix='Building cooccurrence matrix:',
                                       suffix='Complete')

    data = list(cooc_dict.values())
    row = [k1 for k1, k2 in cooc_dict.keys()]
    col = [k2 for k1, k2 in cooc_dict.keys()]
    cooc = coo_matrix((data, (row, col)))

    with open(vocab_dir + cooc_file + '.pkl', 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)

    if verbose > 0:
        print("{} nonzero entries.".format(cooc.nnz))
        print_header_str('DONE')
        print()
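# `count_file_lines` is called above but not defined in these snippets.
# A minimal sketch of what it presumably does (an assumption, not the
# project's actual helper):
def count_file_lines(path):
    # Count the number of lines in a text file without loading it whole.
    with open(path) as f:
        return sum(1 for _ in f)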
def apostrophe_discrepancies(data, feature_names=[]):
    with open('data/external/apostrophe_words.txt', 'r') as f:
        apostrophes = list(
            map(lambda x: tuple(x.split(',')), f.read().splitlines()))
    vectors = []
    data_length = len(data)
    for i, entry in enumerate(data):
        entry = entry.lower()
        local = list(
            map(lambda x: float(min(entry.count(x[0]), entry.count(x[1]))),
                apostrophes))
        vectors.append(local)
        print_progress_bar(i + 1, data_length,
                           description='apostrophe_discrepancies')
    feature_names.extend([', '.join(a) for a in apostrophes])
    return vectors
def load_embeddings(model_manager, labels):
    database = get_database(model_manager)
    coords = database[EMBEDDINGS_COORDINATES_SET_NAME]
    utt_embs = FileArray(model_manager.files['utterance_embeddings'])
    utt_embs.open()
    embeddings = {}
    for progress, (d_idx, turn) in enumerate(labels):
        global_idx, conv_length = coords[d_idx]
        if d_idx not in embeddings:
            embeddings[d_idx] = []
        embeddings[d_idx].append((turn, utt_embs.read(global_idx + turn)))
        if progress % 100 == 0:
            print_progress_bar(progress, len(labels),
                               additional_text='%i embeddings loaded' % progress)
    for k, v in embeddings.items():
        embeddings_turn_sorted = [
            pair[1] for pair in sorted(v, key=lambda p: p[0])
        ]
        embeddings[k] = embeddings_turn_sorted
    utt_embs.close()
    return embeddings
def min_max_lexical_per_sentence(data):
    transformed = []
    data_length = len(data)
    for index, entry in enumerate(data):
        sent_vector = []
        entry_sent = sent_tokenize(entry)
        for sent in entry_sent:
            entry_char = list(sent)
            entry_word = word_tokenize(sent)
            entry_word_tagged = pos_tag(entry_word)
            chars, char_features = lexical_chars(entry_char)
            words, word_features = lexical_words(entry_word_tagged)
            sent_vector.append(chars + words + [
                entry.count('?'),
                entry.count('.'),
                entry.count('!'),
                len(entry)
            ])
        min_v = np.amin(sent_vector, axis=0).tolist()
        max_v = np.amax(sent_vector, axis=0).tolist()
        transformed.append(np.subtract(max_v, min_v).tolist())
        print_progress_bar(index + 1, data_length,
                           description='min_max_lexical_per_sentence')
    return transformed
def lexical(X, feature_names=[]):
    transformed = []
    for i, doc in enumerate(X):
        segments = []
        for entry in doc:
            entry_char = list(entry)
            entry_word = word_tokenize(entry)
            entry_word_tagged = pos_tag(entry_word)
            entry_sent = sent_tokenize(entry)
            chars, char_features = lexical_chars(entry_char)
            words, word_features = lexical_words(entry_word_tagged)
            sentences, sentence_features = lexical_sentences(entry_sent)
            consecutive_dots = [
                entry.count('..') + entry.count('...') + entry.count('....')
            ]
            segments.append(chars + words + sentences + consecutive_dots)
        transformed.append(segments)
        print_progress_bar(i + 1, len(X), description='lexical')
    feature_names.extend(char_features + word_features + sentence_features +
                         ['consecutive_dots'])
    return np.array(transformed)
def phrase_frequency(data, word_gram_sizes, stop_words, use_mean, feature_names=[]):
    vectors = []
    data_length = len(data)
    for i, entry in enumerate(data):
        words = word_tokenize(entry)
        if stop_words:
            words = remove_stop_words(words)
        local = []
        for word_gram_size in word_gram_sizes:
            local.append(
                get_ordered_words_occurances(words, entry, word_gram_size,
                                             use_mean))
        vectors.append(local)
        print_progress_bar(i + 1, data_length, description='phrase_frequency')
    feature_names.extend([str(size) + 'gram' for size in word_gram_sizes])
    return vectors
def encode_categories(image_ids, image_categories, category_id, params):
    """ Replace all category names with their respective IDs and store them
    in a numpy array as a multi-hot vector.
    """
    categories = []

    # Initial call to print 0% progress
    print_progress_bar_counter = 0
    print_progress_bar(print_progress_bar_counter, params['dataset_size'],
                       prefix='Progress:', suffix='Complete', length=50)

    for image_id in image_ids:
        one_hot = [0] * len(category_id)
        if params['single_label']:
            one_hot[category_id[random.choice(image_categories[image_id])]] = 1
        else:
            for category in image_categories[image_id]:
                one_hot[category_id[category]] = 1
        categories.append(one_hot)

        # Update Progress Bar
        print_progress_bar_counter += 1
        print_progress_bar(print_progress_bar_counter, params['dataset_size'],
                           prefix='Progress:', suffix='Complete', length=50)

    return np.array(categories, dtype=np.float32)
def evaluate(name, ranker, kb, batch_size, verbose=0, top_count=5, hooks=None):
    """
    Evaluates an entity ranker on a knowledge base by computing the mean
    reciprocal rank, mean rank, hits@10 etc.
    Can also print the type prediction score with higher verbosity.

    :param name: A name that is displayed with this evaluation on the terminal
    :param ranker: The ranker that is used to rank the entities
    :param kb: The knowledge base to evaluate on. Must be augmented with type
        information when used with higher verbosity
    :param batch_size: The batch size of each minibatch
    :param verbose: The verbosity level. More info is displayed with higher verbosity
    :param top_count: The number of entities whose details are stored
    :param hooks: The additional hooks that need to be run with each mini-batch
    :return: A dict with the mrr, mr, hits10 and hits1 of the ranker on kb
    """
    if hooks is None:
        hooks = []
    totals = {"m": {"mrr": 0, "mr": 0, "hits10": 0, "hits1": 0}}
    start_time = time.time()
    if name == "train":
        facts = kb.facts[:50000]
    else:
        facts = kb.facts
    if verbose > 0:
        totals["correct_type"] = {"e1": 0, "e2": 0}
        entity_type_matrix = kb.entity_type_matrix.cuda()
    for hook in hooks:
        hook.begin()
    for i in range(0, int(facts.shape[0]), batch_size):
        start = i
        end = min(i + batch_size, facts.shape[0])
        s = facts[start:end, 0]
        r = facts[start:end, 1]
        o = facts[start:end, 2]
        knowns_o = ranker.get_knowns(s, r)
        s = torch.autograd.Variable(torch.from_numpy(s).cuda(), requires_grad=False)
        r = torch.autograd.Variable(torch.from_numpy(r).cuda(), requires_grad=False)
        o = torch.autograd.Variable(torch.from_numpy(o).cuda(), requires_grad=False)
        knowns_o = torch.from_numpy(knowns_o).cuda()
        ranks_o, scores_o, score_of_expected_o = ranker.forward(s, r, o, knowns_o)
        # print(ranks_o)
        # e1, r, ?
        totals['m']['mr'] += ranks_o.sum()
        totals['m']['mrr'] += (1.0 / ranks_o).sum()
        totals['m']['hits10'] += ranks_o.le(11).float().sum()
        totals['m']['hits1'] += ranks_o.eq(1).float().sum()
        utils.print_progress_bar(
            end, facts.shape[0], "Eval on %s" % name,
            ("|M| mrr:%3.2f|h10:%3.2f%%|h1:%3.2f|time %5.0f|" %
             (100.0 * totals['m']['mrr'] / end,
              100.0 * totals['m']['hits10'] / end,
              100.0 * totals['m']['hits1'] / end,
              time.time() - start_time)),
            color="green")
        gc.collect()
        torch.cuda.empty_cache()
    for hook in hooks:
        hook.end()
    print(" ")
    totals['m'] = {x: totals['m'][x] / facts.shape[0] for x in totals['m']}
    return totals
def anagram(anag: str):
    """
    For all elements in all dictionaries, find words that contain any anagram
    of `anag` as a substring.

    For "non-consecutive anagrams" you just want a word bank: see wordbank.py.
    """
    found = []
    try:
        all_elems = utils.get_all_dicts()
        perms = perm_strs(anag)
        num_perms = len(perms)
        utils.print_progress_bar(0, num_perms)
        for i, perm in enumerate(perms):
            found.extend([elem for elem in all_elems if perm in elem])
            utils.print_progress_bar(i + 1, num_perms)
    finally:
        print('found {} elems after containing an anagram'.format(len(found)))
        if found:
            utils.list_to_file(fname_anagram(anag), found)
            if len(found) < 100:
                for elem in found:
                    print('\t-', elem)
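# `perm_strs` is used above but not defined in these snippets. A plausible
# sketch, assuming it returns every distinct string formed by permuting the
# characters of its argument (only feasible for short inputs):
from itertools import permutations


def perm_strs(s):
    # Deduplicate with a set so repeated characters don't produce duplicates.
    return sorted({''.join(p) for p in permutations(s)})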
def encode_images(image_ids, image_file, params):
    """ Store images in a numpy array """
    images = []

    # Initial call to print 0% progress
    print_progress_bar_counter = 0
    print_progress_bar(print_progress_bar_counter, params['dataset_size'],
                       prefix='Progress:', suffix='Complete', length=50)

    for image_id in image_ids:
        img_array = load_image(
            os.path.join(params['input_images'], image_file[image_id]),
            size=(params['image_size'], params['image_size']),
            grayscale=params['grayscale'])
        images.append(img_array)

        # Update Progress Bar
        print_progress_bar_counter += 1
        print_progress_bar(print_progress_bar_counter, params['dataset_size'],
                           prefix='Progress:', suffix='Complete', length=50)

    return np.array(images, dtype=np.float32)
def split_points_count(data, words_left, words_right, window_words):
    vectors = []
    data_length = len(data)
    for i, entry in enumerate(data):
        entry = entry.lower()
        words = word_tokenize(entry)
        local = []
        index = 0
        while index <= len(words) - window_words:
            summ = 0
            for word in words[index:index + window_words]:
                l = words_left.get(word, 0)
                r = words_right.get(word, 0)
                summ += max(l, r)
            index += window_words
            local.append(summ)
        vectors.append([max(local)])
        print_progress_bar(i + 1, data_length, description='split_points')
    return vectors
def processed_tags(X, feature_names=[]):
    transformed = []
    for i, doc in enumerate(X):
        segments = []
        for entry in doc:
            words = word_tokenize(entry)
            word_count = len(words)
            word_analysis = dict.fromkeys(preprocessor.tags, 0)
            for word in words:
                for tag in preprocessor.tags:
                    if word == tag:
                        word_analysis[tag] += 1
            segments.append([word_analysis[key] / word_count
                             for key in preprocessor.tags])
        transformed.append(segments)
        print_progress_bar(i + 1, len(X), description='processed tags')
    feature_names.extend(preprocessor.tags)
    return np.array(transformed)
def wikisort_file(file: str):
    _, names = utils.file_to_list(file)
    scores = {}
    couldnt_find = []
    utils.print_progress_bar(0, len(names))
    for i, name in enumerate(names):
        try:
            scores[name] = views_per_month(name)
        except:
            # should probably keep track of the exceptions
            # (so can tell if it's rate limiting etc.)
            couldnt_find.append(name)
        finally:
            utils.print_progress_bar(i + 1, len(names))
    print()
    print('---FAILED TO FIND---')
    print(couldnt_find)
    print('------')
    print()
    sort_by_views = [
        '{}\t{}'.format(k, v)
        for k, v in sorted(scores.items(), key=lambda x: x[1], reverse=True)
    ]
    utils.list_to_file(fname_ranked(file), sort_by_views, do_dedupe=False)
def average_word_frequency(self, X, feature_names=[]):
    transformed = []
    for i, doc in enumerate(X):
        segments = []
        for entry in doc:
            class_sum = 0
            word_count = 0
            uncommon = 0
            entry = entry.lower()
            for w in word_tokenize(entry):
                w = re.sub('[^a-zA-Z]+', '', w)
                if not w:
                    continue
                word_count += 1
                word_class = self.word_class.get(w, 20)
                if word_class == 20:
                    uncommon += 1
                class_sum += word_class
            segments.append([class_sum / word_count, uncommon / word_count])
        transformed.append(segments)
        print_progress_bar(i + 1, len(X), description='word frequency')
    feature_names.extend(['average_word_class', 'uncommon_words'])
    return transformed
def time_perp(main_table_df):
    out.info("Performing Watwin pre-processing...")

    # Watson (2013) doesn't state how the mean and sd are obtained; we assume
    # both are calculated over all compilation pairs.

    # Initialization:
    time_arr = {}
    mean_dict = {}
    std_dict = {}
    subjects = set(main_table_df["SubjectID"])
    timer_index = 1

    for subj in subjects:
        utils.print_progress_bar(timer_index, len(subjects))
        timer_index += 1
        current_df = main_table_df.loc[main_table_df["SubjectID"] == subj]
        current_df = current_df.sort_values(by=['Order'])
        compiles = current_df[current_df["EventType"] == "Compile"]
        compile_errors = current_df[current_df["EventType"] == "Compile.Error"]
        sum_time = 0
        count_time = 0
        if len(compiles) > 1:
            time_arr[subj] = {}
            for i in range(len(compiles) - 1):
                # Watson (2013) requires pair pruning: remove identical pairs.
                if compiles["CodeStateID"].iloc[i + 1] != compiles["CodeStateID"].iloc[i]:
                    e1_errors = compile_errors[
                        compile_errors["ParentEventID"] == compiles["EventID"].iloc[i]]
                    e2_errors = compile_errors[
                        compile_errors["ParentEventID"] == compiles["EventID"].iloc[i + 1]]
                    # If the first compile of the pair resulted in an error
                    if len(e1_errors) > 0:
                        # Watson (2013) requires a time estimate before computing the
                        # score; we assume no invocations are reported in the dataset,
                        # i.e. we use the time difference of the compilation pair directly.
                        datetimeFormat = '%Y-%m-%dT%H:%M:%S'
                        date1 = datetime.datetime.strptime(
                            compiles["ServerTimestamp"].iloc[i + 1], datetimeFormat)
                        date2 = datetime.datetime.strptime(
                            compiles["ServerTimestamp"].iloc[i], datetimeFormat)
                        # Approximate gap in seconds (months treated as 30 days).
                        time_diff = ((((date1.month - date2.month) * 30 +
                                       (date1.day - date2.day)) * 24 +
                                      (date1.hour - date2.hour)) * 60 +
                                     (date1.minute - date2.minute)) * 60 + \
                                    (date1.second - date2.second)
                        sum_time += time_diff
                        count_time = count_time + 1
                        time_arr[subj][compiles["CodeStateID"].iloc[i]] = time_diff
        if count_time != 0:
            mean_time = sum_time / count_time
            mean_dict[subj] = mean_time
            std_time = np.std(np.asarray(list(time_arr[subj].values())))
            std_dict[subj] = std_time
        else:
            mean_time = 0
            mean_dict[subj] = mean_time
            std_time = 0
            std_dict[subj] = std_time

    out.info("Finished Watwin pre-processing...")
    return time_arr, mean_dict, std_dict
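# For reference only: datetime subtraction gives the exact gap in seconds,
# whereas the manual arithmetic above approximates months as 30 days and
# ignores year boundaries. The timestamps here are made-up examples, and this
# is an illustration, not the study's method.
import datetime

fmt = '%Y-%m-%dT%H:%M:%S'
d1 = datetime.datetime.strptime('2019-03-01T10:00:30', fmt)
d2 = datetime.datetime.strptime('2019-02-28T09:59:00', fmt)
print((d1 - d2).total_seconds())  # 86490.0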
def parse_records(self):
    """Parse every microstate record from the ms data file."""
    # This is where the real parsing happens.
    trajectory = np.zeros([self.total_records, self.n_res], dtype=int)
    state_counts = np.zeros([self.total_records], dtype=int)
    energies = np.zeros([self.total_records], dtype=float)

    # Ties into the progress bar defined in utils.py; called once outside the
    # loop to start with an empty bar.
    progress_counter = 0
    print_progress_bar(progress_counter, self.total_records)

    with open(self.ms_data_file, "rb") as ms:
        for index, record in enumerate(self.byte_indices):
            # Each record value is a byte offset marking the beginning of a
            # microstate, so seek() jumps straight to it.
            ms.seek(record)

            # Starting from the record offset, the conformer ids occupy exactly
            # 2 * self.n_res bytes, because each microstate stores one conformer
            # per residue.
            bytes_conf_ids = ms.read(2 * self.n_res)

            # Read the energy of this microstate, then skip 8 bytes whose
            # contents we do not care about.
            bytes_energies_1 = ms.read(8)
            ms.seek(ms.tell() + 8)

            # Convert the binary energy to a double (a float with much higher
            # precision and range).
            energy = struct.unpack("d", bytes_energies_1)[0]

            # The remaining 4 bytes appear to hold a positive number telling us
            # how many times this microstate occurred.
            bytes_state_count = ms.read(4)

            # Unpack the conformer-id bytes as unsigned shorts and store them in
            # the row given by index. The unpacked array must have as many
            # columns as the trajectory, i.e. the number of residues, so each
            # column holds the conformer id of one residue.
            trajectory[index, :] = np.asarray(
                struct.unpack(str(self.n_res) + "H", bytes_conf_ids))
            # print(struct.unpack(str(self.n_res) + "H", bytes_conf_ids)[-2:])

            # Convert the binary state count to an integer and accumulate the
            # total number of microstates seen so far.
            state_count = struct.unpack("i", bytes_state_count)[0]
            self.total_microstates += state_count

            # Record the occurrence count and energy of this microstate.
            state_counts[index] += state_count
            energies[index] += energy

            # Update the progress bar with one more completed step.
            progress_counter += 1
            print_progress_bar(progress_counter, self.total_records)

    # Expose the parsed arrays as instance attributes.
    self.trajectory = trajectory
    self.state_counts = state_counts
    self.energies = energies
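# A minimal round-trip of the record layout that parse_records assumes:
# n_res unsigned shorts (conformer ids), a double (energy), 8 skipped bytes,
# then an int (state count). The values below are invented for illustration.
import struct

n_res = 3
record = (struct.pack("3H", 4, 7, 2) + struct.pack("d", -12.5)
          + b"\x00" * 8 + struct.pack("i", 42))

conf_ids = struct.unpack(str(n_res) + "H", record[:2 * n_res])
energy = struct.unpack("d", record[2 * n_res:2 * n_res + 8])[0]
state_count = struct.unpack("i", record[2 * n_res + 16:2 * n_res + 20])[0]
print(conf_ids, energy, state_count)  # (4, 7, 2) -12.5 42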
def global_ngrams(X, vect, feature_names=[]):
    transformed = []
    for i, doc in enumerate(X):
        transformed.append(vect.transform(doc).toarray())
        print_progress_bar(i + 1, len(X), description='ngrams')
    feature_names.extend(vect.get_feature_names())
    return transformed
def calculate_weights(data, train_positions, inverse_scaling,
                      half_sigmoid_sharpness, size):
    word_weights = {}
    word_counts = {}
    data_length = len(data)
    for i, (entry, positions) in enumerate(zip(data, train_positions)):
        entry = entry.lower()
        fragments = []
        positions.append(len(entry))
        entry_marker = 0
        for change in positions:
            fragments.append(entry[entry_marker:change])
            entry_marker = change
        for fragment in fragments:
            fragment = re.sub("[^a-zA-Z]+", " ", fragment)
            words = word_tokenize(fragment)
            fragment_length = len(words)
            for position, word in enumerate(words):
                if (size and position >= size
                        and position <= fragment_length - size - 1):
                    continue
                word_weight = weight_half_sigmoid(position, fragment_length,
                                                  half_sigmoid_sharpness)
                if word in word_weights:
                    word_weights[word].append(word_weight)
                    word_counts[word] = word_counts[word] + 1
                else:
                    word_weights[word] = [word_weight]
                    word_counts[word] = 1
        print_progress_bar(i + 1, data_length,
                           description='split_points_weights')

    remove_entries(word_weights, stopwords.words('english'))
    remove_entries(word_counts, stopwords.words('english'))

    max_word_count = word_counts[max(word_counts.items(), key=itemgetter(1))[0]]
    min_word_count = word_counts[min(word_counts.items(), key=itemgetter(1))[0]]

    if inverse_scaling:
        additional_weight = lambda k: float(
            (word_counts[k] + 1 - min_word_count) /
            (max_word_count + 1 - min_word_count))
        word_weights = {k: (sum(v) / float(len(v))) * additional_weight(k)
                        for k, v in word_weights.items()}
    else:
        word_weights = {k: sum(v) / float(len(v))
                        for k, v in word_weights.items()}

    for key, value in sorted(word_weights.items(), key=itemgetter(1),
                             reverse=True)[:50]:
        print(key, value)

    return word_weights
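# `weight_half_sigmoid` is called above but not defined in these snippets.
# A hypothetical stand-in, assuming it weights a word by its distance to the
# nearest fragment boundary with one half of a sigmoid, so boundary words
# score close to 1 and mid-fragment words decay towards 0:
import math


def weight_half_sigmoid(position, fragment_length, sharpness):
    distance = min(position, fragment_length - 1 - position)
    return 2.0 / (1.0 + math.exp(sharpness * distance))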
def get_word_tfidf(self, X, feature_names=[]):
    transformed = []
    vect = self.word_vect
    for i, doc in enumerate(X):
        transformed.append(vect.transform(doc).toarray())
        print_progress_bar(i + 1, len(X), description='tfidf')
    feature_names.extend(vect.get_feature_names())
    return transformed
def evaluate_generative(model_manager):
    rand_iter = random_response_generator(model_manager)
    encoder = model_manager.load_currently_selected_model()
    answer_model = get_response_generator(encoder)
    # translator = data_access.get_label_translator(model_manager)
    # evaluator = get_response_evaluator(model_manager.load_currently_selected_model())
    rankings = []
    start_time = time()
    result_arr = FileArray('./results/generative_results_%s.bin' % model_manager.model_name,
                           shape=(1000000, 1), dtype='i4')
    result_arr.open()
    progress = 0
    print('evaluating generative approach for', model_manager.model_name)
    for instance in evaluation_sample_iterator(model_manager):
        progress += 1
        prev_result = result_arr.read(progress)
        if prev_result >= 1:
            progress += 1
            rankings.append(prev_result[0] - 1)
            continue
        random_responses = [next(rand_iter) for x in range(9)]
        context = instance['context']
        cost, answer, pred_utt_emb = answer_model(context)
        candidates = [(cosine(pred_utt_emb, instance['answer_utterance_emb']), True)]
        for random_resp, rand_utt_emb in random_responses:
            cost = cosine(pred_utt_emb, rand_utt_emb)
            candidates.append((cost, False))
        candidates = sorted(candidates, key=lambda pair: pair[0])
        rank = [idx for idx, cand in enumerate(candidates) if candidates[idx][1]][0]
        rankings.append(rank)
        result_arr.write(progress - 1, np.array([rank + 1], dtype='i4'))
        rATk = calculate_recall_at_k(rankings, 10)
        result_str = ' | '.join(['R@%i %.3f%%' % (k + 1, percentage * 100)
                                 for k, percentage in rATk.items()])
        print_progress_bar(instance['progress'], instance['conversations'],
                           additional_text=result_str, start_time=start_time)
    result_arr.close()
def get_function_words(self, X, feature_names=[]):
    transformed = []
    vect = self.function_word_vect
    for i, doc in enumerate(X):
        segments = [self.only_function_words(s) for s in doc]
        transformed.append(vect.transform(segments).toarray())
        print_progress_bar(i + 1, len(X), description='function words')
    feature_names.extend(vect.get_feature_names())
    return transformed
def compute_charcnn_embed(self, batch_size=200):
    ent_cnt = len(self.train.kb.datamap.entity_map)
    print("Precomputing charCNN embeddings")
    with torch.no_grad():
        for i in range(0, ent_cnt, batch_size):
            utils.print_progress_bar(i, ent_cnt)
            inp = self.train.kb.charcnn_packaged(
                [numpy.arange(i, min(i + batch_size, ent_cnt))])
            self.scoring_function.compute_char_embeddings(
                i, i + batch_size, inp[0])
    print("charCNN embeddings computed")
def calculate_weights_count(data, train_positions, inverse_scaling,
                            half_sigmoid_sharpness, size):
    words_left = {}
    words_right = {}
    words_global = {}
    data_length = len(data)
    for i, (entry, positions) in enumerate(zip(data, train_positions)):
        entry = entry.lower()
        fragments = []
        positions.append(len(entry))
        entry_marker = 0
        for change in positions:
            fragments.append(entry[entry_marker:change])
            entry_marker = change
        for fragment in fragments:
            fragment = re.sub("[^a-zA-Z]+", " ", fragment)
            words = word_tokenize(fragment)
            left = words[:size]
            right = words[-size:]
            for word in words:
                if word in words_global:
                    words_global[word] += 1
                else:
                    words_global[word] = 1
            for word in left:
                if word in words_left:
                    words_left[word] = words_left[word] + 1
                else:
                    words_left[word] = 1
            for word in right:
                if word in words_right:
                    words_right[word] = words_right[word] + 1
                else:
                    words_right[word] = 1
        print_progress_bar(i + 1, data_length,
                           description='split_points_weights')

    remove_entries(words_left, stopwords.words('english'))
    remove_entries(words_right, stopwords.words('english'))
    words_left = min_max_dict(words_left)
    words_right = min_max_dict(words_right)

    for key, value in sorted(words_left.items(), key=itemgetter(1), reverse=True)[:50]:
        print(key, value)
    print('====================================================')
    for key, value in sorted(words_right.items(), key=itemgetter(1), reverse=True)[:50]:
        print(key, value)

    return words_left, words_right
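# Hypothetical wiring of the split-point helpers above. The data, positions,
# and parameter values here are illustrative assumptions, not the project's
# real pipeline; the real callers are not shown in these snippets.
train_data = ["first training document whose style changes part way through ...",
              "second training document with another authorship switch ..."]
train_positions = [[40], [55]]          # assumed split positions per document
test_data = ["an unseen document that we want to scan for likely split points"]

words_left, words_right = calculate_weights_count(
    train_data, train_positions, inverse_scaling=False,
    half_sigmoid_sharpness=1.0, size=3)
split_scores = split_points_count(test_data, words_left, words_right,
                                  window_words=5)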
def num_paragraphs(data, feature_names=[]):
    vectors = []
    data_length = len(data)
    for i, entry in enumerate(data):
        vectors.append([float(entry.count('\n') + 1)])
        print_progress_bar(i + 1, data_length, description='num_paragraphs')
    feature_names.extend(['num_paragraphs'])
    return vectors
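# Note: many of the feature extractors above take `feature_names=[]` and call
# `.extend(...)` on it. With Python's mutable default arguments the same list
# object is shared across every call that relies on the default, so names can
# accumulate between calls; whether that matters depends on the callers, which
# are not shown here. A tiny, self-contained demonstration of the behaviour:
def append_name(feature_names=[]):
    feature_names.append('x')
    return feature_names


print(append_name())  # ['x']
print(append_name())  # ['x', 'x'] -- the default list persists between calls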