def selectModelsMulti(numModels, MODEL_FILE, INVERT_INDEX_FILE, numImages):
    model = smh.listdb_load(MODEL_FILE)
    ifs = smh.listdb_load(INVERT_INDEX_FILE)

    # For each model, count the number of times an image appears,
    # and also count the number of images that appear in the model
    allObjects = {}
    allObjectsCountImages = {}
    index = 0
    for objectDiscovered in model.ldb:
        imageCount = {}
        count = 0
        for visualWord in objectDiscovered:
            visualWordName = visualWord.item
            for image in ifs.ldb[visualWordName]:
                if image.item not in imageCount:
                    imageCount[image.item] = 0
                    count += 1
                imageCount[image.item] += 1
                #count+=1
        allObjects[index] = imageCount
        allObjectsCountImages[index] = count
        index += 1

    # Sort models based on the number of images in each model
    allObjectsSorted = sorted(allObjectsCountImages.items(),
                              key=lambda x: x[1], reverse=True)[0:numModels]

    # Assign each image the set of models it appears in, based on the sorted list
    imageToModel = []
    averageLen = 0.0
    for image in range(numImages):
        imageList = set()
        modelIndex = 0
        for modelKey, _ in allObjectsSorted:
            currentModel = allObjects[modelKey]
            if image in currentModel:
                imageList.add(modelIndex)
            modelIndex += 1
        averageLen += len(imageList)
        if len(imageList) == 0:
            imageList = -1
        imageToModel.append(imageList)
    imageToModel = np.array(imageToModel)

    # Count how many images fall into each selected model (-1 means no model)
    dictCount = {}
    for image in imageToModel:
        if image == -1:
            if -1 not in dictCount:
                dictCount[-1] = 0
            dictCount[-1] += 1
        else:
            for model in image:
                if model not in dictCount:
                    dictCount[model] = 0
                dictCount[model] += 1
    print dictCount
    print 'Not Empty : ' + str(averageLen / numImages)

    return imageToModel
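
# Illustrative sketch of the multi-assignment variant: each image is mapped to
# the SET of selected models it appears in, or -1 when it appears in none.
# The file names reuse the 'google.*' names from this repo; the model and image
# counts are assumptions for the example only.
def example_select_models_multi():
    imageToModel = selectModelsMulti(50, 'google.model', 'google.ifs', numImages=5000)
    # Keep only images covered by at least one of the selected models
    covered = [models for models in imageToModel if models != -1]
    return covered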
def reorderTopics(modelName):
    newTopicsName = modelName.rstrip("models") + "ordered_models"
    newExplicitTopicsName = modelName.rstrip("models") + "ordered_models_words"
    vocpath = "./data/train_drugReviews40000.vocab"
    ifspath = "./data/train_drugReviews40000.ifs"

    models = listdb_load(modelName)
    vocabulary, docfreq = load_vocabulary(vocpath)

    # Generate dictionary of lists of documents associated with each wordToken
    ifs_dic = load_ifs_dic(ifspath)

    # Generate the number of documents associated with each topic
    associaNum = {}
    lenModels = 0
    for topicID, topic in enumerate(models.ldb):
        topicDocs = {}
        lenModels += 1
        for wordToken in topic:
            word_Docs = ifs_dic[wordToken.item]
            for doc in word_Docs:
                if doc not in topicDocs:
                    topicDocs[doc] = True
        associaNum[topicID] = len(topicDocs)

    # Sort topics by the number of associated documents
    sortedIDs = sorted(associaNum.items(), key=lambda x: x[1], reverse=True)
    sortedDic = {}  # sends topicID (line num in listdb) to its place in the sorted list
    for i, item in enumerate(sortedIDs):
        sortedDic[item[0]] = i

    # Create a string-array version of the listdb with ordered topics,
    # and an array with the corresponding words
    new_ldb = ["" for i in range(lenModels)]
    explicit_new = ["" for i in range(lenModels)]
    for topicID, topic in enumerate(models.ldb):
        topicPlace = sortedDic[topicID]
        topicStr = str(topic.size)
        explStr = str(associaNum[topicID]) + " ... " + str(topic.size)
        for itemBund in topic:
            topicStr += " " + str(itemBund.item) + ":" + str(itemBund.freq)
            explStr += " " + vocabulary[itemBund.item] + ":" + str(itemBund.freq)
        new_ldb[topicPlace] = topicStr
        explicit_new[topicPlace] = explStr

    # Save ordered listdb
    with codecs.open(newTopicsName, 'w', 'utf-8') as f:
        for line in new_ldb:
            f.write(line + "\n")
    with codecs.open(newExplicitTopicsName, 'w', 'utf-8') as f:
        for line in explicit_new:
            f.write(line + "\n")
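
# Illustrative sketch of reordering a discovered-topics file. The .models path
# is an assumption; since vocpath and ifspath are hardcoded above, the model is
# expected to come from the same train_drugReviews40000 corpus.
def example_reorder_topics():
    reorderTopics("./data/train_drugReviews40000.models")
    # Should write ./data/train_drugReviews40000.ordered_models and
    # ./data/train_drugReviews40000.ordered_models_words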
def createModel(CORPUS_FILE, INVERT_INDEX_FILE, MODEL_FILE, number_of_tuples_l):
    corpus = smh.listdb_load(CORPUS_FILE)
    ifs = smh.listdb_load(INVERT_INDEX_FILE)
    discoverer = SMHD(tuple_size=2,
                      number_of_tuples=number_of_tuples_l,
                      min_set_size=3,
                      overlap=0.8,
                      min_cluster_size=3,
                      cluster_tuple_size=3,
                      cluster_number_of_tuples=255,
                      cluster_table_size=2**24)
    print('Starting fit')
    models = discoverer.fit(ifs, prune=True, expand=corpus)
    print('Finished fit')
    models.save(MODEL_FILE)
def selectModels(numModels, MODEL_FILE, INVERT_INDEX_FILE, numImages):
    model = smh.listdb_load(MODEL_FILE)
    ifs = smh.listdb_load(INVERT_INDEX_FILE)

    # For each model, count the number of times an image appears,
    # and also count the number of images that appear in the model
    allObjects = {}
    allObjectsCountImages = {}
    index = 0
    for objectDiscovered in model.ldb:
        imageCount = {}
        count = 0
        for visualWord in objectDiscovered:
            visualWordName = visualWord.item
            for image in ifs.ldb[visualWordName]:
                if image.item not in imageCount:
                    imageCount[image.item] = 0
                    count += 1
                imageCount[image.item] += 1
                #count+=1
        allObjects[index] = imageCount
        allObjectsCountImages[index] = count
        index += 1

    # Sort models based on the number of images in each model
    allObjectsSorted = sorted(allObjectsCountImages.items(),
                              key=lambda x: x[1], reverse=True)[0:numModels]

    # Assign each image a model, based on the sorted list
    imageToModel = np.zeros(numImages, dtype=int)
    for image in range(numImages):
        modelIndex = 0
        imageToModel[image] = -1
        for modelKey, _ in allObjectsSorted:
            currentModel = allObjects[modelKey]
            if image in currentModel:
                imageToModel[image] = modelIndex
                break
            modelIndex += 1

    unique, counts = np.unique(imageToModel, return_counts=True)
    print dict(zip(unique, counts))

    return imageToModel
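
# Illustrative sketch: keep the numModels objects covering the most images and
# assign every image to the best-ranked object it appears in. File names reuse
# the 'google.*' names from this repo; the counts are assumptions for the example.
def example_select_models():
    imageToModel = selectModels(50, 'google.model', 'google.ifs', numImages=5000)
    # imageToModel[i] is the rank of the first selected model containing image i,
    # or -1 when image i appears in none of the selected models.
    return imageToModel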
def load_ifs_dic(ifspath):
    ifs_ldb = listdb_load(ifspath)
    ifs_dic = {}
    for wordToken, m in enumerate(ifs_ldb.ldb):
        docs = []
        for j in m:
            docs.append(j.item)
        ifs_dic[wordToken] = docs
    return ifs_dic
def load_words2topics(w2tFileName):
    """
    Loads a list database that maps words to topics into a dictionary.
    Returns a dict mapping each word ID to a list of (topicID, topicFreq) tuples.
    """
    words2topics = {}
    w2t_ldb = listdb_load(w2tFileName)
    for wID, wordTopics in enumerate(w2t_ldb.ldb):
        listTopics = []
        for topic in wordTopics:
            listTopics.append((topic.item, topic.freq))
        words2topics[wID] = listTopics
    return words2topics
def createInvertedIndex(CORPUS_FILE, INVERT_INDEX_FILE):
    print('open')
    corpus = smh.listdb_load(CORPUS_FILE)
    print('invert')
    ifs = corpus.invert()
    ifs.save(INVERT_INDEX_FILE)
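
# Illustrative end-to-end sketch: invert a bag-of-visual-words corpus and then
# mine object models from the inverted file. The file names follow the
# 'google.*' convention used in this repo, but 'google.corpus' and the
# number_of_tuples_l value are assumptions for the example.
def example_build_models():
    createInvertedIndex('google.corpus', 'google.ifs')
    createModel('google.corpus', 'google.ifs', 'google.model', number_of_tuples_l=500)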
verbose("Reading index", lang)
with open(os.path.join(args.idir, "{0}wiki.index".format(lang))) as INDEX:
    for idd, line in enumerate(INDEX):
        bits = line.strip().split(" == ")
        pos = int(bits[0])
        idx = int(bits[1])
        url = bits[2]
        title = bits[3]
        if not index.has_key(lang):
            index[lang] = {}
        index[lang][idd] = (pos, url, title)

verbose("Loading topics:", args.TOPICS)
topics = listdb_load(args.TOPICS)
topics_ = []
for topic in topics.ldb:
    idxs = {}
    iterms = 0
    for term in topic:
        if idx2lang.has_key(int(term.item)):
            try:
                idxs[idx2lang[int(term.item)]].add(int(term.item))
            except KeyError:
                idxs[idx2lang[int(term.item)]] = set([int(term.item)])
            iterms += 1
    if args.min and iterms < args.min:
        continue
    topics_.append(idxs)
def discover_topics(ifspath,
                    vocpath,
                    savedir,
                    tuple_size=3,
                    number_of_tuples=None,
                    table_size=2**20,
                    cooccurrence_threshold=0.14,
                    min_set_size=3,
                    weightspath=None,
                    corpuspath=None,
                    cluster_tuple_size=3,
                    cluster_number_of_tuples=255,
                    cluster_table_size=2**20,
                    overlap=0.7,
                    min_cluster_size=3,
                    top_terms_numbers=[10],
                    seed=12345678):
    """
    Discovers topics and evaluates model using topic coherence
    """
    rng_init(seed)

    print "Loading inverted file from", ifspath
    ifs = listdb_load(ifspath)

    print "Loading vocabulary from", vocpath
    vocabulary, docfreq = load_vocabulary(vocpath)

    corpus = None
    if corpuspath:
        print "Loading corpus from", corpuspath
        corpus = listdb_load(corpuspath)

    weights = None
    if weightspath:
        print "Loading weights from", weightspath
        weights = Weights(weightspath)

    model = SMHTopicDiscovery(tuple_size=tuple_size,
                              number_of_tuples=number_of_tuples,
                              table_size=table_size,
                              cooccurrence_threshold=cooccurrence_threshold,
                              min_set_size=min_set_size,
                              cluster_tuple_size=cluster_tuple_size,
                              cluster_number_of_tuples=cluster_number_of_tuples,
                              cluster_table_size=cluster_table_size,
                              overlap=overlap,
                              min_cluster_size=min_cluster_size)

    print "Parameters set to "
    print "   tuple_size =", tuple_size
    print "   number_of_tuples = ", number_of_tuples
    print "   table_size = ", table_size
    print "   cooccurrence_threshold = ", cooccurrence_threshold
    print "   min_set_size = ", min_set_size
    print "   cluster_tuple_size = ", cluster_tuple_size
    print "   cluster_number_of_tuples = ", cluster_number_of_tuples
    print "   cluster_table_size = ", cluster_table_size
    print "   overlap = ", overlap
    print "   min_cluster_size = ", min_cluster_size

    print "Discovering topics"
    start_time = time.time()
    model.fit(ifs, weights=weights, corpus=corpus)
    end_time = time.time()
    total_time = end_time - start_time

    corpusname = os.path.splitext(os.path.basename(ifspath))[0]
    mine_config = '_r' + str(tuple_size) + '_l' + str(model.number_of_tuples_)\
                  + '_w' + str(cooccurrence_threshold)
    mine_config = mine_config + '_s' + str(min_set_size)
    cluster_config = '_o' + str(overlap) + '_m' + str(min_cluster_size)

    modelfile = savedir + '/smh' + mine_config + cluster_config + corpusname + '.models'
    print "Saving resulting models to", modelfile
    model.models.save(modelfile)

    # sort models and save them with different top terms numbers
    topicfile = savedir + '/smh' + mine_config + cluster_config + corpusname + '_unsorted.topics'
    print "Saving the terms of the topic to", topicfile
    save_topics(topicfile, listdb_to_topics(model.models, vocabulary), top=None)

    print "Getting the document frequencies of the models"
    models_docfreq, topics = get_models_docfreq(model.models, vocabulary, docfreq)

    print "Sorting topics"
    for top in top_terms_numbers:
        sorted_topics = sort_topics(models_docfreq, topics, top=top)
        if top:
            top_str = '_top' + str(top)
        else:
            top_str = '_full'
        topicfile = savedir + '/smh' + mine_config + cluster_config + corpusname + top_str + '.topics'
        print "Saving the terms of the topic to", topicfile
        save_topics(topicfile, sorted_topics, top=top)

    timefile = savedir + '/smh' + mine_config + cluster_config + corpusname + '.time'
    print "Saving times to", timefile
    save_time(timefile, total_time)
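
# Illustrative sketch of running topic discovery on the drug-reviews corpus
# used elsewhere in this repo. The save directory is an assumption and must
# exist; the output file names are built from the mining parameters as shown
# in discover_topics above.
def example_discover_topics():
    discover_topics('./data/train_drugReviews40000.ifs',
                    './data/train_drugReviews40000.vocab',
                    './models',
                    tuple_size=3,
                    cooccurrence_threshold=0.14,
                    top_terms_numbers=[10])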
import smh
import pickle
import operator

INVERT_INDEX_FILE = 'google.ifs'
MODEL_FILE = 'google.model'

model = smh.listdb_load(MODEL_FILE)
ifs = smh.listdb_load(INVERT_INDEX_FILE)

idToFileName = {}
with open('indexToFile.pickle', 'rb') as handle:
    idToFileName = pickle.load(handle)

# Create array with all images associated with an objectDiscovered
allObjets = []
for objectDiscovered in model.ldb:
    imageCount = {}
    for visualWord in objectDiscovered:
        visualWordName = visualWord.item
        for image in ifs.ldb[visualWordName]:
            if image.item not in imageCount:
                imageCount[image.item] = 0
            imageCount[image.item] += 1
    allObjets.append(imageCount)

print('Size of allObjets: {}'.format(len(allObjets)))

allObjetsRankingFile = []
for objectDiscovered in allObjets:
    # Sort list of images based on number of visual words
verbose("Reading vocabulary")
for lang in opts.LANG:
    verbose("Reading vocabulary", lang)
    with open(os.path.join(opts.idir,
                           "{0}wiki{1}.voca".format(lang, opts.sufix))) as LANG:
        for line in LANG:
            bits = line.strip().split(" = ")
            w = bits[0]
            idx = int(bits[1])
            idx2word[idx] = w
            word2idx[w] = idx

verbose("Loading file ifs:", opts.IFS)
ifs = listdb_load(opts.IFS)

if not opts.l:
    params = [(int(r), s2l(s, r), s) for r, s in opts.params]
else:
    params = [(int(r), int(l), 0) for r, l in opts.params]
params.sort()

cs = []
for r, l, s in params:
    verbose("======================================= experiment for", r, l, s)
    if s > 0:
        verbose(
            "Experiment tuples (r) {0}, Number of tuples (l) {1}, S* {2}".
def BOWcorpus2emb(corpusFN, vocSize, Train=False, Validate=False, labelsFN=None, forSMH=False):
    """
    Returns a generator of BOW embeddings, taking documents from the given .corpus file.
    If using labelsFN, check that corpusFN and labelsFN have the same length
    (and correspond to each other).
    """
    if Train and Validate:
        Validate = False
        Train = False

    # Setting the Train/Validate division parameters
    if Train or Validate:
        train_size = 0
        with open(corpusFN, "r") as f:
            for line in f:
                train_size += 1
        # Used to divide train / validate
        valNum = 0.2
        valSize = int(train_size * valNum)
        random.seed(12345678)
        valIndexes = set(random.sample(range(train_size), valSize))

    while True:
        if labelsFN:
            f = open(labelsFN, "r")
        corpus = listdb_load(corpusFN)
        for index, doc in enumerate(corpus.ldb):
            # Block to handle Train/Validate division
            if Train:
                if index in valIndexes:
                    continue
            elif Validate:
                if index not in valIndexes:
                    continue
            # Block ends.
            emb = [0 for i in range(vocSize)]
            for wordBundle in doc:
                if wordBundle.item < vocSize:
                    emb[wordBundle.item] = wordBundle.freq
            if not forSMH:
                if labelsFN:
                    label = float(f.readline())
                    yield (np.array([emb]), np.array([label]))
                else:
                    yield (np.array([emb]))
            else:
                if labelsFN:
                    label = float(f.readline())
                    yield emb, label
                else:
                    yield emb
        if labelsFN:
            f.close()
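
# Illustrative sketch of consuming the generator. The corpus path and
# vocabulary size are assumptions; with forSMH=False and no labels file, each
# yielded item is a (1, vocSize) numpy array, so it can be fed to a model one
# document at a time. Note the generator loops over the corpus forever.
def example_bow_generator():
    gen = BOWcorpus2emb('./data/train_drugReviews40000.corpus', vocSize=40000,
                        Train=True)
    first_doc = next(gen)  # numpy array of shape (1, 40000)
    return first_doc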