Example #1
    def __init__(self, train_dist_path, test_dist_path, dis_phone_num, prob_trans_path):
        # Assumes module-level imports: torch and scipy.io as sio.
        f = open(train_dist_path, 'r')

        # train_prob: first line; drop the first three fields and the trailing token
        ln = f.readline()
        temp_prob_str = ln.split(' ')[3:-1]
        self.train_prob = torch.FloatTensor(list(map(float, temp_prob_str)))
        # train_var: second line, same layout
        ln = f.readline()
        temp_var_str = ln.split(' ')[3:-1]
        self.train_var = torch.FloatTensor(list(map(float, temp_var_str)))
        f.close()

        # test_prob
        f = open(test_dist_path, 'r')
        ln = f.readline()
        temp_prob_str = ln.split(' ')[3:-1]
        self.test_prob = torch.FloatTensor(list(map(float, temp_prob_str)))
        # test_var
        ln = f.readline()
        temp_var_str = ln.split(' ')[3:-1]
        self.test_var = torch.FloatTensor(list(map(float, temp_var_str)))
        f.close()

        # ratio: element-wise test/train probability ratio, capped at 1
        length = len(self.test_prob)
        self.prob_ratio = torch.min(self.test_prob / self.train_prob, torch.ones(length))
        print('self.prob_ratio:')
        print(self.prob_ratio)

        assert max(self.prob_ratio) <= 1, 'The prob ratio is larger than 1!?'
        assert min(self.test_var) > 0, 'The min of test var is <= 0!?'
        self.std_var_ratio = torch.sqrt(self.test_var / self.train_var)

        print('self.std_var_ratio:')
        print(self.std_var_ratio)
        print('self.test_var.sum():')
        print(self.test_var.sum())

        # The first 4 entries are zero (unvoiced parts); the rest are the phone probabilities
        self.phone_prob = torch.FloatTensor([0,0,0,0, 0.0202, 0.0522, 0.0917, 0.0153, 0.0088, 0.0483, 0.0130, 0.0048, 0.0290, 0.0212, 0.0249, 0.0177, 0.0240, 0.0146, 0.0093, 0.0194, 0.0490, 0.0457, 0.0050, 0.0296, 0.0367, 0.0407, 0.0530, 0.0114, 0.0416, 0.0011, 0.0124, 0.0302, 0.0457, 0.0073, 0.0571, 0.0064, 0.0047, 0.0249, 0.0123, 0.0191, 0.0287, 0.0230, 0.0002])
        self.prob_sum = self.phone_prob.sum()
        phone_num = len(self.phone_prob)

        assert dis_phone_num >= 0, 'dis_phone_num must be non-negative!'
        self.dis_phone_num = dis_phone_num 

        # Per-phone upper bound on the probability ratio, floored at 0.2
        threshold_prob = self.prob_sum / ((phone_num - dis_phone_num) * 0.8)
        self.prob_ratio_upper = torch.max(self.phone_prob / threshold_prob, 0.2 * torch.ones(phone_num))

        # load the GMM params
        temp_mat = sio.loadmat(prob_trans_path)
        self.mu_ratio = torch.from_numpy(temp_mat['mu_ratio']).float()
        # cumulative sum of the GMM component weights
        self.comp_wcum = torch.from_numpy(temp_mat['comp_wcum']).float()
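
The two distribution files are parsed line by line: each relevant line is split on spaces, the first three fields and the trailing token are dropped, and the remaining values become a float tensor. A minimal, self-contained sketch of that parsing step (the line content below is made up for illustration):

import torch

# A made-up line in the layout the constructor expects: three leading fields,
# then space-separated float values, then a trailing newline token.
ln = "phone 0 probs: 0.1 0.2 0.3 0.4 \n"

# Mirrors ln.split(' ')[3:-1] above: drop the first three fields and the
# trailing element, keep only the numeric part.
temp_prob_str = ln.split(' ')[3:-1]
prob = torch.FloatTensor(list(map(float, temp_prob_str)))
print(prob)  # tensor([0.1000, 0.2000, 0.3000, 0.4000])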
Example #2
def guess_dim(fname):
    """
    This funcion guesses the dimensionality of the embedding.
    There can be two types of files: with a header (vocabsize and dim separated by a space)
    and without a header (the first line is the first embedding vector).
    """
    with open(fname) as F:
        first_line = F.readline().strip()
        p = first_line.split()
        if len(p) == 2:
            # this is the header line
            return int(p[1])
        else:
            # this is the first embedding vector
            return len(p) - 1  # first element is the word itself
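
For illustration, a quick check of both layouts the docstring describes; the file names and contents below are made up, and guess_dim is assumed to be importable as defined above:

with open("with_header.vec", "w") as out:
    out.write("2 3\n")               # header: vocab size and dimensionality
    out.write("cat 0.1 0.2 0.3\n")
    out.write("dog 0.4 0.5 0.6\n")

with open("no_header.vec", "w") as out:
    out.write("cat 0.1 0.2 0.3\n")   # first line is already an embedding vector
    out.write("dog 0.4 0.5 0.6\n")

print(guess_dim("with_header.vec"))  # 3, read from the header
print(guess_dim("no_header.vec"))    # 3, inferred from the first vector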
Example #3
    def load_embeddings(self, words, embedding_file):
        """Load pretrained embeddings for a given list of words, if they exist.
        Args:
            words: iterable of tokens. Only those that are indexed in the
              dictionary are kept.
            embedding_file: path to text file of embeddings, space separated.
        """
        emb_layer = self.network.embedder.word_embeddings
        words = {w for w in words if w in self.src_dict}
        logger.info('Loading pre-trained embeddings for %d words from %s' %
                    (len(words), embedding_file))

        # After normalization, several surface forms can map to the same word,
        # so duplicate embeddings are averaged below.
        vec_counts, embedding = {}, {}
        with open(embedding_file, encoding='utf8') as f:
            # Skip the first line if it is a "count dim" header.
            line = f.readline().rstrip().split(' ')
            if len(line) != 2:
                f.seek(0)

            duplicates = set()
            for line in tqdm(f, total=count_file_lines(embedding_file)):
                parsed = line.rstrip().split(' ')
                assert (len(parsed) == emb_layer.word_vec_size + 1)
                w = self.src_dict.normalize(parsed[0])
                if w in words:
                    vec = torch.Tensor([float(i) for i in parsed[1:]])
                    if w not in vec_counts:
                        vec_counts[w] = 1
                        embedding[w] = vec
                    else:
                        duplicates.add(w)
                        vec_counts[w] = vec_counts[w] + 1
                        embedding[w].add_(vec)

            if len(duplicates) > 0:
                logger.warning('WARN: Duplicate embedding found for %s' %
                               ', '.join(duplicates))

        for w, c in vec_counts.items():
            embedding[w].div_(c)

        emb_layer.init_word_vectors(self.src_dict, embedding,
                                    self.args.fix_embeddings)
        logger.info('Loaded %d embeddings (%.2f%%)' %
                    (len(vec_counts), 100 * len(vec_counts) / len(words)))
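
Duplicates are handled by summing repeated vectors during the pass over the file and dividing by the count afterwards. A stripped-down, self-contained sketch of that averaging pattern (the tokens, vectors, and lower-casing normalization are made-up stand-ins for self.src_dict.normalize):

import torch

# Made-up (token, vector) rows; "The" and "the" collapse to one key after
# normalization, so their vectors must be averaged.
parsed_rows = [
    ("The", [1.0, 1.0]),
    ("the", [3.0, 3.0]),
    ("cat", [2.0, 2.0]),
]

vec_counts, embedding = {}, {}
for token, values in parsed_rows:
    w = token.lower()              # stand-in for self.src_dict.normalize
    vec = torch.Tensor(values)
    if w not in vec_counts:
        vec_counts[w] = 1
        embedding[w] = vec
    else:
        vec_counts[w] += 1
        embedding[w].add_(vec)     # accumulate duplicate vectors in place

for w, c in vec_counts.items():
    embedding[w].div_(c)           # divide by the count to get the average

print(embedding["the"])  # tensor([2., 2.])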
Example #4
    def read_ann(file_p):
        # Assumes a module-level "import pandas as pd".
        anns = []
        with open(file_p, 'r') as F:
            while True:
                line = F.readline()
                if line == '':
                    break
                if line == '\n':
                    continue
                ann = line.strip().split('\t')
                anns.append(ann)

        header_list = ["pmid", "off_a", "off_b", "name", "type", "id", "s_i"]
        anns = pd.DataFrame(anns, columns=header_list)
        return anns
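
The function expects a tab-separated file with seven fields per annotation line; blank lines are skipped and reading stops at end of file. A small end-to-end sketch with a made-up file, assuming read_ann is reachable as a plain function (it takes no self):

import pandas as pd

# Write a made-up two-annotation file in the expected tab-separated layout:
# pmid, start offset, end offset, mention, type, id, s_i.
with open("toy.ann", "w") as out:
    out.write("123\t0\t5\tBRCA1\tGene\tG:672\t0\n")
    out.write("\n")                                   # blank lines are skipped
    out.write("123\t10\t18\tmelanoma\tDisease\tD:8923\t1\n")

anns = read_ann("toy.ann")
print(anns[["pmid", "name", "type"]])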