def learn_basis_over(self, eps=10e-8, text_path=None, block_sz=0):
    if not text_path:
        print("no text file off which to learn")
        return
    text_words = utils.extract_words(text_path, length=8)
    print('len textwords', len(text_words))
    for k in trange(len(text_words)):  # trange: tqdm progress-bar range
        # skip positions whose context window would run off either end
        if k < block_sz or k + block_sz >= len(text_words):
            continue
        # bind the window into one vector by elementwise multiplication
        word_vec = np.ones((1, self.N))
        for bidx in range(k - block_sz, k + block_sz + 1):
            word_vec = np.multiply(word_vec, self.wordz[text_words[bidx]])
        word_vec = word_vec / np.linalg.norm(word_vec)
        # project onto the basis and reinforce each column by its weight
        weights = word_vec.dot(self.basis)
        for i in range(self.b):
            self.basis[:, i] = self.basis[:, i] + weights[0, i] * word_vec.ravel()
    # normalize basis columns, then zero out near-zero entries
    for i in range(self.b):
        self.basis[:, i] = self.basis[:, i] / np.linalg.norm(self.basis[:, i])
    self.basis[self.basis < eps] = 0
    return
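The update above is a Hebbian-style reinforcement: each windowed word vector is projected onto the basis, every column is nudged toward it in proportion to that projection, and the columns are re-normalized. A minimal standalone sketch of a single such step (plain numpy, with made-up dimensions N and b in place of the class attributes):

import numpy as np

N, b = 512, 10                              # hypothetical dimensionality / basis size
rng = np.random.default_rng(0)
basis = rng.standard_normal((N, b))
basis /= np.linalg.norm(basis, axis=0)      # unit-norm columns

word_vec = rng.standard_normal(N)
word_vec /= np.linalg.norm(word_vec)

weights = word_vec @ basis                  # projection onto each column, shape (b,)
basis += weights * word_vec[:, np.newaxis]  # reinforce each column by its weight
basis /= np.linalg.norm(basis, axis=0)      # re-normalize columns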
def add_words(self, text_path=None, byletter=0):
    if not text_path:
        print("no text file to load")
        return
    words = utils.extract_words(text_path, length=8)
    for word in words:
        if word in self.wordz:
            continue
        if byletter:
            # fresh sparse ternary random-index vector: k ones, k minus-ones
            rand_vector = np.zeros((1, self.N))
            rand_idx = np.random.permutation(self.N)
            rand_vector[0, rand_idx[0:self.k]] = 1
            rand_vector[0, rand_idx[self.k:2 * self.k]] = -1
        else:
            # compose the word vector by binding (elementwise-multiplying) the
            # letter vectors; start from ones, since multiplying into a zero
            # vector would stay zero
            rand_vector = np.ones((1, self.N))
            for letter in word:
                letter_idx = alphabet.index(letter)
                letter_vec = self.RI_letters[:, letter_idx]
                # reshape to a (1, N) row so the product stays (1, N)
                rand_vector = np.multiply(rand_vector, letter_vec[np.newaxis, :])
        self.wordz[word] = rand_vector
    return
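The else branch composes a word vector by binding one random index vector per letter. A standalone illustration of why that works, assuming a 26-letter lowercase alphabet and dense +/-1 letter vectors as a simplified stand-in for RI_letters:

import string
import numpy as np

N = 512                                     # hypothetical vector dimensionality
alphabet = string.ascii_lowercase
rng = np.random.default_rng(0)
RI_letters = rng.choice([-1.0, 1.0], size=(N, len(alphabet)))

def word_vector(word):
    vec = np.ones(N)                        # start from ones: binding is multiplicative
    for letter in word:
        vec *= RI_letters[:, alphabet.index(letter)]
    return vec

# vectors of unrelated words are near-orthogonal in high dimensions
v1, v2 = word_vector('cat'), word_vector('dog')
print(v1 @ v2 / N)                          # close to 0

Note that elementwise multiplication is commutative, so anagrams ('cat', 'act') map to the same vector; the class presumably accepts that collision or encodes position elsewhere.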
def get_all_words(self):
    words = defaultdict(int)
    print('delimiters: {}'.format(TITLE_DELIMITER))
    videos = self.db_handler.get_all_videos()
    for video in videos:
        # count occurrences of each normalized word across video titles
        for word in extract_words(video.title):
            words[prepare_word(word)] += 1
    return words
def fill_words_for_videos(self):
    # map word text to its Word row for fast lookup
    words = self.db_handler.db_session.query(Word).all()
    word_dict = {word.word: word for word in words}
    videos = self.db_handler.get_all_videos()
    for video in videos:
        # collect the ids of all known words appearing in the title
        wordids = set()
        for word in extract_words(video.title):
            w = prepare_word(word)
            if w in word_dict:
                wordids.add(word_dict[w].id)
        video.wordids = serialize_ids(wordids)
    self.db_handler.commit()
for x in a:
    unique_pos_tag_list = unique_pos_tag_list.union(set(x))
unique_pos_tag_list = sorted(unique_pos_tag_list)
# drop the first eight entries and the last one
unique_pos_tag_list = unique_pos_tag_list[8:-1]
# collapse tags to their two-character prefixes (e.g. 'NNS' -> 'NN')
uptl = sorted(set(x[:2] for x in unique_pos_tag_list))
uptl = np.array(uptl)

treatment_suffix = ['in', 'apy', 'ine', 'tomy']
disease_suffix = ['is', 'ia']
trset1 = list(utils.extract_words('medicinenet-treatments.txt'))
dis = list(utils.extract_words('malacards-diseases.txt'))
# keep only non-numeric tokens from each gazetteer
dis = set(x for x in dis if not x.isdigit())
trset = set(x for x in trset1 if not x.isdigit())

features = utils.extract_features(train_data, uptl, treatment_suffix,
                                  disease_suffix, dis)
vocabs['dis'] = dis
vocabs['treatment_suffix'] = treatment_suffix
vocabs['disease_suffix'] = disease_suffix
vocabs['uptl'] = uptl
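The suffix lists and gazetteers feed utils.extract_features, whose internals are not shown here. Presumably each token is flagged when it ends in a treatment- or disease-like suffix or appears in the disease gazetteer; a hypothetical per-token sketch of that kind of featurization (function name and feature keys are illustrative, not from the source):

def token_features(token, treatment_suffix, disease_suffix, dis):
    # hypothetical features in the spirit of utils.extract_features
    return {
        'has_treatment_suffix': any(token.endswith(s) for s in treatment_suffix),
        'has_disease_suffix': any(token.endswith(s) for s in disease_suffix),
        'in_disease_gazetteer': token.lower() in dis,
    }

# token_features('chemotherapy', treatment_suffix, disease_suffix, dis)
# -> {'has_treatment_suffix': True, 'has_disease_suffix': False, ...}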
def calculate_title_rank(self, title, f):
    # sum the score f(word) over every title word found in the dictionary
    word_dict = self.get_word_dict_by_word()
    title_words = extract_words(title)
    return sum(f(word_dict[x]) for x in title_words if x in word_dict)
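Since f maps a Word row to a numeric score, different rankings can reuse the same method. For example (ranker is a hypothetical instance and count an assumed Word attribute, neither confirmed by the snippet):

# rank a title by total word frequency
rank = ranker.calculate_title_rank('some video title', lambda w: w.count)
# or down-weight common words instead
rank = ranker.calculate_title_rank('some video title', lambda w: 1.0 / (1 + w.count))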
import os
import sys

import utils

if __name__ == '__main__':
    print('main')
    if len(sys.argv) < 2:
        raise Exception('Enter the path of the directory')
    path = sys.argv[1]
    if not os.path.isdir(path):
        raise Exception(f'Invalid path: {path}')
    print('path', path)
    for idx, filename in enumerate(os.listdir(path)):
        tags, text = utils.extract_tags_risk(os.path.join(path, filename))
        words = utils.extract_words(text, tags, idx)
        df = utils.map_output(words)
        # splitext handles filenames containing extra dots correctly
        name = os.path.splitext(filename)[0]
        utils.save_to_csv('.', f'{name}.csv', df)
        data_name=data_loader_dict['name'],
        alphabet=data_loader_dict['alphabet'],
        num_classes=data_loader_dict['num_classes'])

    with open(config.prediction_filename, 'r') as f:
        predictions_dict = json.load(f)

    references_lists, references_tokens = [], []
    candidates_tokens = [[] for _ in range(FLAGS.beam_size)]
    candidates_lists = [[] for _ in range(FLAGS.beam_size)]
    classes, class_predictions = [], []
    stats = collections.defaultdict(list)

    for prediction_dict in predictions_dict['predictions']:
        opt_idx = prediction_dict.get('optimal_candidate_idx', None)
        # tokenize the reference target; keep both the word list and its
        # encoded token-id string
        target_list = utils.extract_words(prediction_dict['target'])
        references_lists.append(target_list)
        references_tokens.append(' '.join(
            map(str, vocabulary.encode_seq(target_list))))
        if 'class' in prediction_dict:
            classes.append(prediction_dict['class'])
        if 'class_prediction' in prediction_dict:
            class_predictions.append(prediction_dict['class_prediction'])
        # accumulate per-candidate stats and tokenized candidate lists,
        # grouped by beam position
        for i, candidate_dict in enumerate(prediction_dict['candidates']):
            stats['losses'].append(candidate_dict['loss'])
            stats['prob_x_values'].append(candidate_dict['prob_x'])
            candidate_list = utils.extract_words(candidate_dict['prediction'])
            candidates_lists[i].append(candidate_list)