def vocabs(self):
    """Return the (char2idx, idx2char) vocabulary mappings for this dataset.

    When ``self.full`` is set, the vocabulary is built from the whole corpus;
    otherwise it is built only from the training loader.
    """
    # Pick the data source first, then build the mappings in one call.
    source = self.corpus if self.full else self.train_loader
    char2idx, idx2char = utils.get_vocabulary(source)
    return char2idx, idx2char
def __init__(self, root, voc, num_samples=np.inf, label_transform=None,
             voc_type='string', lowercase=False, alphanumeric=False,
             ctc_blank='<b>', return_list=False):
    """LMDB-backed dataset for CTC-style recognition.

    Args:
        root: path to the LMDB directory to open (read-only).
        voc: base vocabulary spec, expanded via ``get_vocabulary``.
        num_samples: cap on the number of samples exposed (default: all).
        label_transform: optional callable applied to labels -- used by
            ``__getitem__`` elsewhere in the class, presumably; TODO confirm.
        voc_type: vocabulary type string forwarded to ``get_vocabulary``.
        lowercase: if True, vocabulary is lowercased (via ``get_vocabulary``).
        alphanumeric: if True, vocabulary is restricted to alphanumerics.
        ctc_blank: token used as the CTC blank; always mapped to id 0.
        return_list: flag stored for use by other methods -- TODO confirm use.
    """
    super(lmdbDataset, self).__init__()
    self.env = lmdb.open(root, max_readers=100, readonly=True)
    # lmdb.open normally raises on failure; this assert is kept as a
    # belt-and-braces guard to preserve the original failure mode.
    assert self.env is not None, "cannot create lmdb from %s" % root
    self.txn = self.env.begin()
    self.label_transform = label_transform
    # "num-samples" is stored as a byte string; going through float()
    # tolerates values serialized as e.g. b"1000.0".
    self.nSamples = int(float(self.txn.get(b"num-samples")))
    self.nSamples = min(self.nSamples, num_samples)
    # NOTE: the original also did `self.voc = voc` here, which was dead code
    # because the next line unconditionally overwrites it.
    self.voc = get_vocabulary(voc, voc_type, lowercase, alphanumeric)
    # Id 0 is reserved for the CTC blank, so real characters start at 1.
    self.char2id = dict(zip(self.voc, range(1, len(self.voc) + 1)))
    self.id2char = dict(zip(range(1, len(self.voc) + 1), self.voc))
    self.char2id[ctc_blank] = 0
    self.id2char[0] = ctc_blank
    self.ctc_blank = ctc_blank
    self.lowercase = lowercase
    self.alphanumeric = alphanumeric
    # Number of output classes = vocabulary + the blank entry.
    self.rec_num_classes = len(self.id2char)
    self.return_list = return_list
def test_get_vocabulary(
    data_dir='/Users/aditinair/Desktop/NLU-DL/Contextual-Conversational-Model/data/processed_en/',
    vocab_size=1000,
):
    """Build a vocabulary of ``vocab_size`` entries from ``data_dir``.

    The defaults reproduce the original hard-coded machine-specific path and
    size exactly, so existing zero-argument calls behave identically; callers
    can now point the helper at any processed-data directory instead of
    editing the source.

    Returns:
        Whatever ``utils.get_vocabulary`` returns for the given arguments.
    """
    return utils.get_vocabulary(vocab_size, data_dir)
request1 = requests.get(url) file1 = ZipFile(BytesIO(request1.content)) file1.extractall() url2 = "https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip" request2 = requests.get(url) file2 = ZipFile(BytesIO(request2.content)) file2.extractall() """ # Loading all captions, getting vocabulary captions = load_file('Flickr8k.token.txt') all_descriptions = load_all_descriptions(captions) vocab, all_descriptions = get_vocabulary(all_descriptions) # Saving captions save_descriptions(all_descriptions, "final_captions.txt") entire_dataset = load_file("final_captions.txt") # getting model if args.model == "inception": model = inception_model() else: model = vgg_model() # extracting features from model for all training images and saving it
import utils import time import os from config import * from tqdm import tqdm if __name__ == "__main__": vocab = utils.get_vocabulary() # loads vocabulary present in system citations = utils.get_citations() # loads citation counts for doc_ids while True: print("Enter a word to search the index for:") x = input() if x in vocab: start_time = time.time() if os.path.exists(("indexes/inverted_index_" + x + ".pbz2")): index = utils.load_index("indexes/inverted_index_" + x) loaded = x else: index = utils.load_index(filename="indexes/inverted_index_" + x[0]) loaded = x[0] end_time = time.time() print(("Took {} seconds to load index " + loaded).format(end_time - start_time)) print(index[x]["doc_frequency"]) # print number of docs term is in for k in list(index[x]["doc_ids"].keys())[:10]: print( k, index[x]["doc_ids"][k], citations[k] ) # print top 10 docs for term, how many times term in doc, and citations of doc else: