Example No. 1
    def vocabs(self):
        """get char2idx, idx2char and tokens for given dataset"""
        if self.full:
            char2idx, idx2char = utils.get_vocabulary(self.corpus)

        else:
            char2idx, idx2char = utils.get_vocabulary(self.train_loader)

        return char2idx, idx2char
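
For context, a minimal sketch of what a char-level utils.get_vocabulary might look like (hypothetical; the real helper is not shown in this example):

def get_vocabulary(texts):
    # unique characters in a stable, sorted order
    chars = sorted(set("".join(texts)))
    char2idx = {c: i for i, c in enumerate(chars)}
    idx2char = {i: c for i, c in enumerate(chars)}
    return char2idx, idx2char

# get_vocabulary(["ab", "ba"]) -> ({'a': 0, 'b': 1}, {0: 'a', 1: 'b'})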
Example No. 2
	def __init__(self, root, voc, num_samples=np.inf,
				 label_transform=None,
				 voc_type='string', lowercase=False,
				 alphanumeric=False, ctc_blank='<b>',
				 return_list=False):
		super(lmdbDataset, self).__init__()

		self.env = lmdb.open(root, max_readers=100, readonly=True)

		assert self.env is not None, "cannot create lmdb from %s" % root
		self.txn = self.env.begin()

		self.label_transform = label_transform
		self.nSamples = int(float(self.txn.get(b"num-samples")))
		self.nSamples = min(self.nSamples, num_samples)

		self.voc = get_vocabulary(voc, voc_type, lowercase, alphanumeric)
		self.char2id = dict(zip(self.voc, range(1, len(self.voc)+1))) # 0 reserved for ctc blank
		self.id2char = dict(zip(range(1, len(self.voc)+1), self.voc))
		self.char2id[ctc_blank] = 0
		self.id2char[0] = ctc_blank
		self.ctc_blank = ctc_blank
		self.lowercase = lowercase
		self.alphanumeric = alphanumeric
		self.rec_num_classes = len(self.id2char)
		self.return_list = return_list
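
A hedged usage sketch; the LMDB path and vocabulary string below are illustrative assumptions, not from the source:

dataset = lmdbDataset(root='./data/train_lmdb',  # assumed path
                      voc='abcdefghijklmnopqrstuvwxyz0123456789',
                      voc_type='string', lowercase=True)
print(dataset.nSamples)         # sample count read from the b"num-samples" key
print(dataset.rec_num_classes)  # vocabulary size + 1 for the CTC blank

Reserving index 0 for the CTC blank matches the default blank=0 of PyTorch's nn.CTCLoss.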
Example No. 3
def test_get_vocabulary():

    data_dir = '/Users/aditinair/Desktop/NLU-DL/Contextual-Conversational-Model/data/processed_en/'

    return utils.get_vocabulary(1000, data_dir)
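
Here the first argument reads as a vocabulary-size cap. A minimal word-level sketch under that assumption (the helper is hypothetical; the real utils.get_vocabulary is not shown):

import os
from collections import Counter

def get_vocabulary(max_size, data_dir):
    # count whitespace-separated tokens across all files in data_dir
    counts = Counter()
    for name in os.listdir(data_dir):
        with open(os.path.join(data_dir, name), encoding='utf-8') as f:
            for line in f:
                counts.update(line.split())
    # keep the max_size most frequent words
    return [word for word, _ in counts.most_common(max_size)]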
Example No. 4
request1 = requests.get(url)
file1 = ZipFile(BytesIO(request1.content))
file1.extractall()

url2 = "https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip"
request2 = requests.get(url2)
file2 = ZipFile(BytesIO(request2.content))
file2.extractall()

"""

# Loading all captions, getting vocabulary

captions = load_file('Flickr8k.token.txt')
all_descriptions = load_all_descriptions(captions)
vocab, all_descriptions = get_vocabulary(all_descriptions)

# Saving captions

save_descriptions(all_descriptions, "final_captions.txt")

entire_dataset = load_file("final_captions.txt")

# Getting the model

if args.model == "inception":
    model = inception_model()
else:
    model = vgg_model()

# Extracting features from the model for all training images and saving them
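
The excerpt stops before the extraction code; a sketch of what that step often looks like with a Keras CNN (the function name, image directory, and preprocessing choice are assumptions):

import os
import numpy as np
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input

def extract_features(model, img_dir, target_size=(299, 299)):
    # hypothetical helper: model is assumed to be the CNN chosen above
    # with its classification head removed
    features = {}
    for fname in os.listdir(img_dir):
        img = image.load_img(os.path.join(img_dir, fname), target_size=target_size)
        x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))
        features[fname] = model.predict(x, verbose=0)
    return features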
Example No. 5
import utils
import time
import os
from config import *
from tqdm import tqdm

if __name__ == "__main__":
    vocab = utils.get_vocabulary()  # load the vocabulary stored on disk
    citations = utils.get_citations()  # load citation counts keyed by doc_id
    while True:
        print("Enter a word to search the index for:")
        x = input()

        if x in vocab:
            start_time = time.time()
            if os.path.exists("indexes/inverted_index_" + x + ".pbz2"):
                index = utils.load_index("indexes/inverted_index_" + x)
                loaded = x
            else:
                # fall back to the shared per-letter index for the word's first character
                index = utils.load_index(filename="indexes/inverted_index_" + x[0])
                loaded = x[0]
            end_time = time.time()
            print(("Took {} seconds to load index " +
                   loaded).format(end_time - start_time))
            print(index[x]["doc_frequency"])  # print number of docs term is in
            for k in list(index[x]["doc_ids"].keys())[:10]:
                print(
                    k, index[x]["doc_ids"][k], citations[k]
                )  # print top 10 docs for term, how many times term in doc, and citations of doc
        else:
            print(x, "is not in the vocabulary.")  # plausible fallback; the original excerpt is truncated here
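
For reference, the lookups above imply an index shaped roughly like this (keys and counts are illustrative):

index = {
    "vocabulary": {
        "doc_frequency": 2,             # number of documents containing the term
        "doc_ids": {"D1": 3, "D2": 1},  # doc id -> occurrences of the term in that doc
    }
}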