def test_vocab():
    user_vocab = Vocab(vocab_file='data/prediction/embeddings/user_vecs.vocab')
    print(f'user vocab length: {len(user_vocab)}')
    print([user_vocab[i] for i in range(5)])
    user_vectors = np.load('data/prediction/embeddings/user_vecs.npy')
    # prepend two zero rows for the special tokens (presumably PAD/UNK)
    user_vectors = np.concatenate(
        (np.zeros((2, user_vectors.shape[1]), dtype=float), user_vectors),
        axis=0)
    print(f'user vectors shape: {user_vectors.shape}')
    print('-' * 30)

    sub_vocab = Vocab(vocab_file='data/prediction/embeddings/sub_vecs.vocab')
    print(f'sub vocab length: {len(sub_vocab)}')
    print([sub_vocab[i] for i in range(5)])
    sub_vectors = np.load('data/prediction/embeddings/sub_vecs.npy')
    sub_vectors = np.concatenate(
        (np.zeros((2, sub_vectors.shape[1]), dtype=float), sub_vectors),
        axis=0)
    print(f'sub vectors shape: {sub_vectors.shape}')
    print('-' * 30)

    words, word_vectors = load_glove_emb(
        'data/prediction/embeddings/glove_word_embeds.txt')
    word_vectors = np.concatenate(
        (np.zeros((2, word_vectors.shape[1]), dtype=float), word_vectors),
        axis=0)
    word_vocab = Vocab(words=words)
    print(f'word vocab length: {len(word_vocab)}')
    print([word_vocab[i] for i in range(5)])
    print(f'word vectors shape: {word_vectors.shape}')
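A minimal follow-up sketch, not part of the original snippet: loading one of the padded matrices above into a torch embedding layer. The helper name embedding_from_vectors is made up here, and it assumes the two prepended zero rows stand for special PAD/UNK entries at indices 0 and 1.

import numpy as np
import torch
import torch.nn as nn

def embedding_from_vectors(vectors: np.ndarray, freeze: bool = True) -> nn.Embedding:
    # rows 0 and 1 are the zero rows prepended above (assumed PAD/UNK)
    weight = torch.from_numpy(vectors).float()
    return nn.Embedding.from_pretrained(weight, freeze=freeze, padding_idx=0)

# e.g. user_embedding = embedding_from_vectors(user_vectors)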
Example No. 2
def build_vocab(name, dataset_list, cache_path, word_vec_path=None, feat_dim=None):
    logging.info('  building a language model...')
    if not os.path.exists(cache_path):
        lang_model = Vocab(name)
        for dataset in dataset_list:
            logging.info('    indexing words from {}'.format(dataset.lmdb_dir))
            index_words(lang_model, dataset.lmdb_dir)

        if word_vec_path is not None:
            lang_model.load_word_vectors(word_vec_path, feat_dim)

        with open(cache_path, 'wb') as f:
            pickle.dump(lang_model, f)
    else:
        logging.info('    loaded from {}'.format(cache_path))
        with open(cache_path, 'rb') as f:
            lang_model = pickle.load(f)

        if word_vec_path is None:
            lang_model.word_embedding_weights = None
        elif lang_model.word_embedding_weights.shape[0] != lang_model.n_words:
            logging.warning('    failed to load word embedding weights. check this')
            assert False

    return lang_model
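A hypothetical call to build_vocab above; train_set, val_set, the cache path, and the word-vector path are placeholders rather than the project's real values. The only interface assumed from the function body is that each dataset exposes an lmdb_dir attribute.

lang_model = build_vocab(
    name='words',
    dataset_list=[train_set, val_set],       # any objects with an .lmdb_dir attribute
    cache_path='cache/vocab_cache.pkl',      # pickled Vocab is cached here on first run
    word_vec_path='data/word_vectors.bin',   # optional pre-trained vectors
    feat_dim=300,                            # dimension passed to load_word_vectors
)
print('vocabulary size:', lang_model.n_words)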
Example No. 3
def __init__(self, vocab_path, data_path_list, max_length):
    super(Dataset, self).__init__()

    self.vocab_path = vocab_path
    self.data_path_list = data_path_list
    self.max_length = max_length
    self.data = None
    self.vocab = Vocab(self.vocab_path)
    self._prepareData()
Example No. 4
def build_vocab(self, embed_file):
    word_counts = Counter()
    count_words(word_counts, [src + tgt for src, tgt in self.pairs])
    vocab = Vocab()
    for word, count in word_counts.most_common(config.max_vocab_size):
        vocab.add_words([word])
    if embed_file is not None:
        count = vocab.load_embeddings(embed_file)
        print('%d pre-trained embeddings loaded.' % count)
    return vocab
Example No. 5
def build_model(config):
    vocab = Vocab(config['vocab'])
    device = config['device']

    model = OCR(len(vocab), config)
    model = model.to(device)

    return model, vocab
Example No. 6
def build_model(config):
    vocab = Vocab(config['vocab'])
    device = config['device']

    model = VietOCR(len(vocab), config['backbone'], config['cnn'],
                    config['transformer'], config['seq_modeling'])
    model = model.to(device)

    return model, vocab
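A hypothetical driver for the build_model variant above. The config keys mirror the ones read inside the function; every value below is a placeholder, not the project's real default.

config = {
    'vocab': 'abcdefghijklmnopqrstuvwxyz0123456789 ',  # character set handed to Vocab
    'device': 'cuda:0',                                # or 'cpu'
    'backbone': 'vgg19_bn',                            # placeholder backbone name
    'cnn': {},                                         # backbone-specific kwargs
    'transformer': {},                                 # transformer head kwargs
    'seq_modeling': 'transformer',
}
model, vocab = build_model(config)
print('vocab size:', len(vocab))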
Example No. 7
    def _make_speaker_model(self, lmdb_dir, cache_path):
        logging.info('  building a speaker model...')
        speaker_model = Vocab('vid', insert_default_tokens=False)

        lmdb_env = lmdb.open(lmdb_dir, readonly=True, lock=False)
        txn = lmdb_env.begin(write=False)
        cursor = txn.cursor()
        for key, value in cursor:
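            # pyarrow.deserialize is deprecated in recent pyarrow releases; newer
            # code typically stores pickled values in LMDB instead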
            video = pyarrow.deserialize(value)
            vid = video['vid']
            speaker_model.index_word(vid)

        lmdb_env.close()
        logging.info('    indexed %d videos' % speaker_model.n_words)
        self.speaker_model = speaker_model

        # cache
        with open(cache_path, 'wb') as f:
            pickle.dump(self.speaker_model, f)
Example No. 8
# sampling
if settings.DOWN_SAMPLING:
    train_df_burst = train_df[train_df['label'] == 'burst']
    train_df_non_burst = train_df[train_df['label'] == 'non-burst']
    train_df = shuffle(
        pd.concat(
            (train_df_non_burst.sample(n=int(len(train_df_burst) * settings.LABEL_RATIO)),
             train_df_burst),
            ignore_index=True))

# print(len(train_df[train_df['label'] == 'burst']))
# print(len(train_df[train_df['label'] == 'non-burst']))

# load vocab
user_vocab = Vocab(vocab_file=settings.USER_VOCAB_FN)
sub_vocab = Vocab(vocab_file=settings.SUB_VOCAB_FN)
words, word_vectors = load_glove_emb(fn=settings.GLOVE_EMBEDDING_FN)
word_vocab = Vocab(words=list(range(len(words))), additional_terms=False)
label_vocab = Vocab(words=['non-burst', 'burst'], additional_terms=False)

# make dataset
train_ds = RedditDataset(
    df=train_df,
    user_vocab=user_vocab,
    sub_vocab=sub_vocab,
    word_vocab=word_vocab,
    label_vocab=label_vocab,
    content_col='content',
)
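A minimal follow-up sketch, assumed rather than taken from the original script: wrapping the dataset above in a PyTorch DataLoader for batched training. Depending on what RedditDataset items look like, a custom collate_fn may be required.

from torch.utils.data import DataLoader

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
for batch in train_loader:
    # forward/backward pass would go here
    break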
Example No. 9
class Dataset(Dataset):
    """
    A dataset basically supports iteration over all the examples it contains.
    We currently supports only text data with this class.
    This class is inheriting Dataset class in torch.utils.data.
    """

    def __init__(self, vocab_path, data_path_list, max_length):
        super(Dataset, self).__init__()
        
        self.vocab_path = vocab_path
        self.data_path_list = data_path_list    
        self.max_length = max_length
        self.data = None
        self.vocab = Vocab(self.vocab_path)
        self._prepareData()
        
    def __getitem__(self, index):
        item_list = []
        for item in self.data[index]:
            item_list.append(self.vocab.sentence_to_indices(item))
            item_list.append(len(item))
        return item_list
                
    def __len__(self):
        return len(self.data)
    
    def _prepareData(self):
        data = self._readData()
        print("Read {} sentence pairs".format(len(data)))
        
        data = self._filterDatas(data)
        print("Trim data to {} sentence pairs \n".format(len(data)))
        
        print("[*] Success to preprocess data! \n")
        
        self.data = data

    def _readData(self):
        print("[*] Reading lines...")
    
        # Read the file and split into lines
        lines_list = []
        for file_path in self.data_path_list:
            with open(file_path, 'r', encoding='utf-8') as f:
                lines_list.append([self._preprocessing(l).split(' ') for l in f])
        data = list(zip(*lines_list))
        
        # Print statistics
        for i, lines in enumerate(lines_list):
            print("Avg length of data {} : {:.2f}".format(i, sum([len(l) for l in lines]) / len(data)))
        print()
        
        return data
    
    def _preprocessing(self, s):
        return s.strip().lower()
    
    def _filterDatas(self, data):
        data = [d for d in data if self._chkMaxLength(d)]
        return data

    def _chkMaxLength(self, p):
        return len(p[0]) <= self.max_length and len(p[1]) <= self.max_length and len(p[0]) > 0 and len(p[1]) > 0

    def getInstanceSize(self):
        return len(self.data)

    def getVocabSize(self):
        return len(self.vocab)
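A hypothetical instantiation of the Dataset class above; the file paths are placeholders. Per __getitem__, each item interleaves an index sequence and its raw length for every parallel file.

dataset = Dataset(
    vocab_path='data/vocab.txt',
    data_path_list=['data/train.src', 'data/train.tgt'],  # two parallel text files
    max_length=50,
)
print('instances:', dataset.getInstanceSize(), '| vocab size:', dataset.getVocabSize())
src_indices, src_length, tgt_indices, tgt_length = dataset[0]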
Example No. 10
    )

    if not os.path.exists(settings.TRAIN_DF_DUMP):
        with open(settings.TRAIN_DF_DUMP, mode='wb') as f:
            pickle.dump(train, f)

    if not os.path.exists(settings.TEST_DF_DUMP):
        with open(settings.TEST_DF_DUMP, mode='wb') as f:
            pickle.dump(test, f)

    if not os.path.exists(settings.DEV_DF_DUMP):
        with open(settings.DEV_DF_DUMP, mode='wb') as f:
            pickle.dump(dev, f)

    print('Loading vocab...')
    user_vocab = Vocab(vocab_file=settings.USER_VOCAB_FN)
    sub_vocab = Vocab(vocab_file=settings.SUB_VOCAB_FN)
    words, word_vecs = load_glove_emb(fn=settings.GLOVE_EMBEDDING_FN)
    word_vocab = Vocab(words=list(range(len(words))))
    label_vocab = Vocab(words=['non-burst', 'burst'])

    ds = RedditDataset(
        df=cross_label_tokenized,
        user_vocab=user_vocab,
        sub_vocab=sub_vocab,
        word_vocab=word_vocab,
        label_vocab=label_vocab,
        content_col='content',
    )

    # ds_it = iter(ds)
Example No. 11
    # Data Path Arguments
    parser.add_argument(
        '--acronyms_fn',
        default='../expansion_etl/data/derived/prototype_acronym_expansions.csv'
    )
    parser.add_argument(
        '--semgroups_fn',
        default='../expansion_etl/data/original/umls_semantic_groups.txt')

    # Model Distribution Hyperparameters
    parser.add_argument('--document_topic_prior', type=float, default=1.0)
    parser.add_argument('--topic_expansion_prior', type=float, default=1.0)

    args = parser.parse_args()

    semgroup_vocab = Vocab('semgroups')
    with open(args.semgroups_fn, 'r') as semgroup_fd:
        semgroup_vocab.add_tokens(
            list(map(lambda x: x.strip().split('|')[1], semgroup_fd)))

    acronyms = pd.read_csv(args.acronyms_fn)
    sf_vocab = {}
    sfs = acronyms['sf'].unique()
    print('Creating expansion vocabularies for {} short forms'.format(
        len(sfs)))
    for sf in sfs:
        sf_vocab[sf] = Vocab(sf)
        sf_vocab[sf].add_tokens(acronyms[acronyms['sf'] == sf]['lf'].tolist())
        print('\tVocabulary size of {} for {}'.format(sf_vocab[sf].size(), sf))

    # Model Dimensions & Hyperparameters