Пример #1
0
    def learn_basis_over(self, eps=10e-8, text_path=None, block_sz=0):
        """Update ``self.basis`` in place from sliding word windows of a text.

        For every position ``k`` that has ``block_sz`` words on each side,
        the ``self.wordz`` vectors of the words in the window
        ``[k - block_sz, k + block_sz]`` are multiplied element-wise and the
        product is scaled to unit norm.  Each of the ``self.b`` basis columns
        then receives that vector scaled by the column's current dot product
        with it, every column is rescaled to unit norm, and entries below
        ``eps`` are set to zero.

        Args:
            eps: Threshold applied after each update; every basis entry
                smaller than it becomes 0.
            text_path: Path handed to ``utils.extract_words``.  When falsy a
                message is printed and nothing else happens.
            block_sz: Number of neighbouring words taken on each side of the
                centre word.

        Returns:
            None.  The result is the mutated ``self.basis``.
        """
        if not text_path:
            print("no text file off which to learn")
            return

        text_words = utils.extract_words(text_path, length=8)
        num_words = len(text_words)
        # Single-argument call: prints the same text on Python 2 and 3.
        print('len textwords %d' % num_words)

        for k in trange(num_words):
            # Skip positions whose window would run off either end.
            if k < block_sz or k + block_sz >= num_words:
                continue

            # Element-wise product of all word vectors in the window.
            word_vec = np.ones((1, self.N))
            for bidx in range(k - block_sz, k + block_sz + 1):
                word_vec = np.multiply(word_vec, self.wordz[text_words[bidx]])

            norm = np.linalg.norm(word_vec)
            if norm == 0:
                # An all-zero product has no direction; dividing by its norm
                # would turn every basis entry into NaN.
                continue
            word_vec = word_vec / norm

            # weights[0][i] is the dot product of word_vec with column i.
            weights = word_vec.dot(self.basis)

            for i in range(self.b):
                self.basis[:, i] = np.reshape(
                    self.basis[:, i], (1, self.N)) + weights[0][i] * word_vec

            # Rescale every basis column to unit norm.
            for i in range(self.b):
                column = self.basis[:, i]
                self.basis[:, i] = np.reshape(
                    column, (1, self.N)) / np.linalg.norm(column)

            # NOTE(review): this zeroes negative entries as well as small
            # positive ones; confirm that an absolute-value test was not
            # intended.
            self.basis[self.basis < eps] = 0
Пример #2
0
    def add_words(self, text_path=None, byletter=0):
        """Give every not-yet-known word of a text file a vector in ``self.wordz``.

        Words that are already keys of ``self.wordz`` keep their vector.

        Args:
            text_path: Path handed to ``utils.extract_words``.  When falsy a
                message is printed and nothing else happens.
            byletter: Selects how a new vector is built.  When truthy, a
                ``(1, self.N)`` vector with ``self.k`` randomly placed +1
                entries and another ``self.k`` -1 entries.  When falsy, the
                element-wise product of the ``self.RI_letters`` columns of
                the word's letters.

        Returns:
            None.  The result is the updated ``self.wordz``.

        Raises:
            Whatever ``alphabet.index`` raises for a letter that is not in
            ``alphabet`` (``ValueError`` if ``alphabet`` is a str or list).
        """
        if not text_path:
            print("no text file to load")
            return

        for word in utils.extract_words(text_path, length=8):
            if word in self.wordz:
                continue

            # NOTE(review): the branches look swapped relative to the
            # parameter name (a truthy ``byletter`` picks the random vector,
            # not the per-letter one).  Left as found because callers may
            # rely on it; confirm the intent.
            if byletter:
                rand_vector = np.zeros((1, self.N))
                rand_idx = np.random.permutation(self.N)
                rand_vector[0, rand_idx[:self.k]] = 1
                rand_vector[0, rand_idx[self.k:2 * self.k]] = -1
            else:
                # Start from ones: a product seeded with zeros stays zero
                # whatever the letters are.
                rand_vector = np.ones((1, self.N))
                for letter in word:
                    letter_vec = self.RI_letters[:, alphabet.index(letter)]
                    # Reshape to a row so the product stays (1, N) instead of
                    # broadcasting to (N, N).  Assumes RI_letters has self.N
                    # rows -- TODO confirm.
                    rand_vector = np.multiply(
                        rand_vector, np.reshape(letter_vec, (1, self.N)))

            self.wordz[word] = rand_vector
    def get_all_words(self):
        """Count how often each prepared word occurs across all video titles.

        Prints the module-level ``TITLE_DELIMITER``, then runs every word
        that ``extract_words`` yields for each video title through
        ``prepare_word`` and tallies the results.

        Returns:
            A ``defaultdict`` mapping each prepared word to its number of
            occurrences; looking up an unseen word gives 0.
        """
        # Function-call form: prints the same text on Python 2 and 3.
        print('delimiters: {}'.format(TITLE_DELIMITER))

        words = defaultdict(int)
        for video in self.db_handler.get_all_videos():
            for word in extract_words(video.title):
                words[prepare_word(word)] += 1

        return words
    def fill_words_for_videos(self):
        """Store on each video the serialized ids of known words in its title.

        All ``Word`` rows are loaded and indexed by their ``word`` attribute.
        For every video, each title word is passed through ``prepare_word``;
        the ids of those found in the index are collected into a set, which
        is serialized with ``serialize_ids`` and assigned to
        ``video.wordids``.  The changes are committed once at the end.
        """
        session = self.db_handler.db_session
        known = {row.word: row for row in session.query(Word).all()}

        for video in self.db_handler.get_all_videos():
            prepared = (prepare_word(raw)
                        for raw in extract_words(video.title))
            matched_ids = {known[token].id
                           for token in prepared if token in known}
            video.wordids = serialize_ids(matched_ids)

        self.db_handler.commit()
# Fold the contents of every entry of `a` into the running set of tags.
for x in a:
    unique_pos_tag_list = unique_pos_tag_list | set(x)

# Sort the tags, then drop the first eight and the last one.
unique_pos_tag_list = sorted(unique_pos_tag_list)[8:-1]

# Distinct two-character tag prefixes, sorted, as a NumPy array.
uptl = np.array(sorted({tag[:2] for tag in unique_pos_tag_list}))

treatment_suffix = ['in', 'apy', 'ine', 'tomy']
disease_suffix = ['is', 'ia']

# Word lists read from the two text files.
trset1 = utils.extract_words('medicinenet-treatments.txt')
dis = utils.extract_words('malacards-diseases.txt')
dis, trset1 = list(dis), list(trset1)

# Keep only the entries that are not purely digits.
dis = {entry for entry in dis if not entry.isdigit()}
trset = {entry for entry in trset1 if not entry.isdigit()}

features = utils.extract_features(
    train_data, uptl, treatment_suffix, disease_suffix, dis)

# Record the vocabularies that were used to build the features.
vocabs['dis'] = dis
vocabs['treatment_suffix'] = treatment_suffix
vocabs['disease_suffix'] = disease_suffix
vocabs['uptl'] = uptl
    def calculate_title_rank(self, title, f):
        """Sum ``f`` over the dictionary entries of the words in ``title``.

        Args:
            title: Text passed to ``extract_words``.
            f: Callable applied to the entry of each title word that is a
                key of ``self.get_word_dict_by_word()``.

        Returns:
            The sum of the ``f`` results; 0 when no title word is known.
        """
        known_words = self.get_word_dict_by_word()
        scores = [
            f(known_words[token])
            for token in extract_words(title)
            if token in known_words
        ]
        return sum(scores)
Пример #7
0
import os
import sys

import utils

if __name__ == '__main__':
    # Script entry point: process every entry of the directory given as the
    # first command-line argument and write one CSV per entry into the
    # current directory.
    print('main')
    if not sys.argv[1:]:
        raise Exception('Enter the path of the directory')

    path = sys.argv[1]
    path_is_dir = os.path.isdir(path)
    if not path_is_dir:
        raise Exception('Invalid path : ', path)
    print('path', path)

    for idx, filename in enumerate(os.listdir(path)):
        source_file = os.path.join(path, filename)
        tags, text = utils.extract_tags_risk(source_file)
        words = utils.extract_words(text, tags, idx)
        df = utils.map_output(words)

        # Output name is the part of the file name before its first dot.
        name = filename.partition('.')[0]
        utils.save_to_csv('.', f'{name}.csv', df)
                              data_name=data_loader_dict['name'],
                              alphabet=data_loader_dict['alphabet'],
                              num_classes=data_loader_dict['num_classes'])

# Read the predictions JSON named by config.prediction_filename.
with open(config.prediction_filename) as f:
    predictions_dict = json.load(f)

# Accumulators for the reference side.
references_lists = []
references_tokens = []

# One accumulator list per beam position.
candidates_tokens = [[] for _ in range(FLAGS.beam_size)]
candidates_lists = [[] for _ in range(FLAGS.beam_size)]

classes = []
class_predictions = []

# Named lists of per-candidate values; unseen keys start as an empty list.
stats = collections.defaultdict(list)
# Walk the loaded predictions, filling the reference, class and per-candidate
# accumulators.
for prediction_dict in predictions_dict['predictions']:
    # Value stored under 'optimal_candidate_idx', or None when it is absent.
    opt_idx = prediction_dict.get('optimal_candidate_idx', None)
    # Keep the target both as a word list and as a space-joined string of
    # its vocabulary encoding.
    target_list = utils.extract_words(prediction_dict['target'])
    references_lists.append(target_list)
    references_tokens.append(' '.join(
        map(str, vocabulary.encode_seq(target_list))))

    # Class information is optional; collect it only when present.
    if 'class' in prediction_dict:
        classes.append(prediction_dict['class'])
    if 'class_prediction' in prediction_dict:
        class_predictions.append(prediction_dict['class_prediction'])

    # i is the candidate's position in the list and selects its accumulator.
    for i, candidate_dict in enumerate(prediction_dict['candidates']):
        stats['losses'].append(candidate_dict['loss'])
        stats['prob_x_values'].append(candidate_dict['prob_x'])

        candidate_list = utils.extract_words(candidate_dict['prediction'])
        candidates_lists[i].append(candidate_list)