Example No. 1
def selectModelsMulti(numModels, MODEL_FILE, INVERT_INDEX_FILE, numImages):
    model = smh.listdb_load(MODEL_FILE)
    ifs = smh.listdb_load(INVERT_INDEX_FILE)

    # For each model, count how many times each image appears and how many
    # distinct images appear in the model
    allObjects = {}
    allObjectsCountImages = {}
    index = 0
    for objectDiscovered in model.ldb:
        imageCount = {}
        count = 0
        for visualWord in objectDiscovered:
            visualWordName = visualWord.item
            for image in ifs.ldb[visualWordName]:
                if image.item not in imageCount:
                    imageCount[image.item] = 0
                    count += 1
                imageCount[image.item] += 1
        allObjects[index] = imageCount
        allObjectsCountImages[index] = count
        index += 1

    # Sort models by the number of distinct images in each model and keep the
    # numModels largest ones
    allObjectsSorted = sorted(allObjectsCountImages.items(), key=lambda x: x[1], reverse=True)[0:numModels]

    # Assign each image the set of models it appears in, based on the sorted list
    imageToModel = []
    averageLen = 0.0
    for image in range(numImages):
        imageList = set()
        modelIndex = 0
        for modelKey, _ in allObjectsSorted:
            currentModel = allObjects[modelKey]
            if image in currentModel:
                imageList.add(modelIndex)
            modelIndex += 1
        averageLen += len(imageList)
        if len(imageList) == 0:
            # Images that appear in none of the selected models are marked with -1
            imageList = -1
        imageToModel.append(imageList)

    imageToModel = np.array(imageToModel)
    dictCount = {}
    for image in imageToModel:
        if image == -1:
            if -1 not in dictCount:
                dictCount[-1] = 0
            dictCount[-1] += 1
        else:
            for model in image:
                if model not in dictCount:
                    dictCount[model] = 0
                dictCount[model] += 1
    print dictCount
    print 'Not Empty : ' + str(averageLen / numImages)

    return imageToModel
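
# Minimal usage sketch (not part of the original script): the file names are the
# ones used in the standalone example further below, while numModels=50 and
# numImages=5000 are placeholder values. Each entry of the result is a set of
# model indices, or -1 for images that fall in none of the selected models.
multiAssignment = selectModelsMulti(50, 'google.model', 'google.ifs', 5000)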
def reorderTopics(modelName):

    # str.rstrip strips a set of characters rather than a suffix, which can
    # over-strip the base name; remove the trailing "models" extension explicitly.
    baseName = modelName[:-len("models")] if modelName.endswith("models") else modelName
    newTopicsName = baseName + "ordered_models"
    newExplicitTopicsName = baseName + "ordered_models_words"
    vocpath = "./data/train_drugReviews40000.vocab"
    ifspath = "./data/train_drugReviews40000.ifs"

    models = listdb_load(modelName)
    vocabulary, docfreq = load_vocabulary(vocpath)

    # Generate a dictionary mapping each word token to the list of documents it appears in
    ifs_dic = load_ifs_dic(ifspath)

    # Count the number of distinct documents associated with each topic
    associaNum = {}
    lenModels = 0
    for topicID, topic in enumerate(models.ldb):
        topicDocs = {}
        lenModels += 1

        for wordToken in topic:
            word_Docs = ifs_dic[wordToken.item]
            for doc in word_Docs:
                if doc not in topicDocs:
                    topicDocs[doc] = True

        associaNum[topicID] = len(topicDocs)

    # Sorting Topics
    sortedIDs = sorted(associaNum.items(), key=lambda x: x[1], reverse=True)
    sortedDic = {}  # maps topicID (line number in the listdb) to its position in the sorted list
    for i, item in enumerate(sortedIDs):
        sortedDic[item[0]] = i

    # Creating string array version of listdb with ordered topics, and an array with the words
    new_ldb = ["" for i in range(lenModels)]
    explicit_new = ["" for i in range(lenModels)]

    for topicID, topic in enumerate(models.ldb):
        topicPlace = sortedDic[topicID]
        topicStr = str(topic.size)
        explStr = str(associaNum[topicID]) + " ... " + str(topic.size)

        for itemBund in topic:
            topicStr += " " + str(itemBund.item) + ":" + str(itemBund.freq)
            explStr += " " + vocabulary[itemBund.item] + ":" + str(
                itemBund.freq)

        new_ldb[topicPlace] = topicStr
        explicit_new[topicPlace] = explStr

    # Save ordered listdb
    with codecs.open(newTopicsName, 'w', 'utf-8') as f:
        for line in new_ldb:
            f.write(line + "\n")

    with codecs.open(newExplicitTopicsName, 'w', 'utf-8') as f:
        for line in explicit_new:
            f.write(line + "\n")
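
# Minimal usage sketch; the model file name below is an assumption based on the
# hard-coded vocabulary/inverted-file paths above, not a value from the source.
reorderTopics('./data/train_drugReviews40000.models')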
Example No. 3
def createModel(CORPUS_FILE, INVERT_INDEX_FILE, MODEL_FILE,
                number_of_tuples_l):
    corpus = smh.listdb_load(CORPUS_FILE)
    ifs = smh.listdb_load(INVERT_INDEX_FILE)

    discoverer = SMHD(tuple_size=2,
                      number_of_tuples=number_of_tuples_l,
                      min_set_size=3,
                      overlap=0.8,
                      min_cluster_size=3,
                      cluster_tuple_size=3,
                      cluster_number_of_tuples=255,
                      cluster_table_size=2**24)

    print('Starting fit')
    models = discoverer.fit(ifs, prune=True, expand=corpus)
    print('Finished fit')
    models.save(MODEL_FILE)
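
# Hypothetical usage sketch: the corpus/inverted-index/model file names and the
# number of tuples are placeholders, not values taken from the original code.
createModel('google.corpus', 'google.ifs', 'google.model', number_of_tuples_l=500)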
Example No. 4
def selectModels(numModels, MODEL_FILE, INVERT_INDEX_FILE, numImages):
    model = smh.listdb_load(MODEL_FILE)
    ifs = smh.listdb_load(INVERT_INDEX_FILE)

    # For each model, count how many times each image appears and how many
    # distinct images appear in the model
    allObjects = {}
    allObjectsCountImages = {}
    index = 0
    for objectDiscovered in model.ldb:
        imageCount = {}
        count = 0
        for visualWord in objectDiscovered:
            visualWordName = visualWord.item
            for image in ifs.ldb[visualWordName]:
                if image.item not in imageCount:
                    imageCount[image.item] = 0
                    count += 1
                imageCount[image.item] += 1
        allObjects[index] = imageCount
        allObjectsCountImages[index] = count
        index += 1

    # Sort models by the number of distinct images in each model and keep the
    # numModels largest ones
    allObjectsSorted = sorted(allObjectsCountImages.items(), key=lambda x: x[1], reverse=True)[0:numModels]

    # Assign each image a single model (the first match in the sorted list);
    # images that appear in none of the selected models keep -1
    imageToModel = np.zeros(numImages, dtype=int)
    for image in range(numImages):
        modelIndex = 0
        imageToModel[image] = -1
        for modelKey, _ in allObjectsSorted:
            currentModel = allObjects[modelKey]
            if image in currentModel:
                imageToModel[image] = modelIndex
                break
            modelIndex += 1

    unique, counts = np.unique(imageToModel, return_counts=True)
    print dict(zip(unique, counts))
    return imageToModel
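
# Hedged usage sketch (numModels and numImages are placeholder values): assign
# each image to at most one of the 50 largest models, then group image indices
# by the model index they were assigned to (-1 collects unassigned images).
assignment = selectModels(50, 'google.model', 'google.ifs', 5000)
imagesPerModel = {}
for imageIdx, modelIdx in enumerate(assignment):
    imagesPerModel.setdefault(modelIdx, []).append(imageIdx)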
def load_ifs_dic(ifspath):
    """Load an inverted file (.ifs) into a dict mapping word token ID -> list of document IDs."""
    ifs_ldb = listdb_load(ifspath)

    ifs_dic = {}

    for wordToken, m in enumerate(ifs_ldb.ldb):
        docs = []
        for j in m:
            docs.append(j.item)

        ifs_dic[wordToken] = docs

    return ifs_dic
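
# Brief usage sketch, reusing the inverted-file path hard-coded above: each key
# is a word token ID and each value is the list of document IDs containing it.
ifs_dic = load_ifs_dic('./data/train_drugReviews40000.ifs')
docsWithToken0 = ifs_dic[0]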
Example No. 6
def load_words2topics(w2tFileName):
    """
	Loads list data base maps words to topics into dictionary.
	Returns array of tuples: [(docID,docFreq)_i]
	"""

    words2topics = {}

    w2t_ldb = listdb_load(w2tFileName)

    for wID, wordTopics in enumerate(w2t_ldb.ldb):
        listTopics = []
        for topic in wordTopics:
            listTopics.append((topic.item, topic.freq))
        words2topics[wID] = listTopics

    return words2topics
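
# Hypothetical usage sketch ('words2topics.ldb' is a placeholder file name):
# look up the topics containing word ID 0 as (topicID, topicFreq) pairs.
words2topics = load_words2topics('words2topics.ldb')
topicsForWord0 = words2topics[0]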
Example No. 7
def createInvertedIndex(CORPUS_FILE, INVERT_INDEX_FILE):
    print('open')
    corpus = smh.listdb_load(CORPUS_FILE)
    print('invert')
    ifs = corpus.invert()
    ifs.save(INVERT_INDEX_FILE)
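
# Minimal usage sketch (file names are placeholders): the .ifs file produced
# here is what createModel and the selectModels helpers above expect as
# INVERT_INDEX_FILE.
createInvertedIndex('google.corpus', 'google.ifs')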
Example No. 8
        verbose("Reading index", lang)
        with open(os.path.join(args.idir,
                               "{0}wiki.index".format(lang))) as INDEX:
            for idd, line in enumerate(INDEX):
                bits = line.strip().split(" == ")
                pos = int(bits[0])
                idx = int(bits[1])
                url = bits[2]
                title = bits[3]
                if lang not in index:
                    index[lang] = {}
                index[lang][idd] = (pos, url, title)

    verbose("Loading topics:", args.TOPICS)
    topics = listdb_load(args.TOPICS)
    topics_ = []
    for topic in topics.ldb:
        idxs = {}
        iterms = 0
        for term in topic:
            if int(term.item) in idx2lang:
                try:
                    idxs[idx2lang[int(term.item)]].add(int(term.item))
                except KeyError:
                    idxs[idx2lang[int(term.item)]] = set([int(term.item)])
            iterms += 1
        if args.min and iterms < args.min:
            continue
        topics_.append(idxs)
def discover_topics(ifspath,
                    vocpath,
                    savedir,
                    tuple_size=3,
                    number_of_tuples=None,
                    table_size=2**20,
                    cooccurrence_threshold=0.14,
                    min_set_size=3,
                    weightspath=None,
                    corpuspath=None,
                    cluster_tuple_size=3,
                    cluster_number_of_tuples=255,
                    cluster_table_size=2**20,
                    overlap=0.7,
                    min_cluster_size=3,
                    top_terms_numbers=[10],
                    seed=12345678):
    """
    Discovers topics and evaluates model using topic coherence
    """
    rng_init(seed)

    print "Loading inverted file from", ifspath
    ifs = listdb_load(ifspath)

    print "Loading vocabulary from", vocpath
    vocabulary, docfreq = load_vocabulary(vocpath)

    corpus = None
    if corpuspath:
        print "Loading corpus from", corpuspath
        corpus = listdb_load(corpuspath)

    weights = None
    if weightspath:
        print "Loading weights from", weightspath
        weights = Weights(weightspath)

    model = SMHTopicDiscovery(
        tuple_size=tuple_size,
        number_of_tuples=number_of_tuples,
        table_size=table_size,
        cooccurrence_threshold=cooccurrence_threshold,
        min_set_size=min_set_size,
        cluster_tuple_size=cluster_tuple_size,
        cluster_number_of_tuples=cluster_number_of_tuples,
        cluster_table_size=cluster_table_size,
        overlap=overlap,
        min_cluster_size=min_cluster_size)

    print "Parameters set to "
    print "   tuple_size =", tuple_size
    print "   number_of_tuples = ", number_of_tuples
    print "   table_size = ", table_size
    print "   cooccurrence_threshold = ", cooccurrence_threshold
    print "   min_set_size = ", min_set_size
    print "   cluster_tuple_size = ", cluster_tuple_size
    print "   cluster_number_of_tuples = ", cluster_number_of_tuples
    print "   cluster_table_size = ", cluster_table_size
    print "   overlap = ", overlap
    print "   min_cluster_size = ", min_cluster_size

    print "Discovering topics"
    start_time = time.time()
    model.fit(ifs, weights=weights, corpus=corpus)
    end_time = time.time()
    total_time = end_time - start_time

    corpusname = os.path.splitext(os.path.basename(ifspath))[0]
    mine_config = '_r' + str(tuple_size) + '_l' +  str(model.number_of_tuples_)\
                  + '_w' + str(cooccurrence_threshold)
    mine_config = mine_config + '_s' + str(min_set_size)
    cluster_config = '_o' + str(overlap) + '_m' + str(min_cluster_size)

    modelfile = savedir + '/smh' + mine_config + cluster_config + corpusname + '.models'
    print "Saving resulting models to", modelfile
    model.models.save(modelfile)

    # sort models and save them with different top terms numbers
    topicfile = savedir + '/smh' + mine_config + cluster_config + corpusname + '_unsorted.topics'
    print "Saving the terms of the topic to", topicfile
    save_topics(topicfile,
                listdb_to_topics(model.models, vocabulary),
                top=None)

    print "Getting the document frequencies of the models"
    models_docfreq, topics = get_models_docfreq(model.models, vocabulary,
                                                docfreq)

    print "Sorting topics"
    for top in top_terms_numbers:
        sorted_topics = sort_topics(models_docfreq, topics, top=top)
        if top:
            top_str = '_top' + str(top)
        else:
            top_str = '_full'

        topicfile = savedir + '/smh' + mine_config + cluster_config + corpusname + top_str + '.topics'
        print "Saving the terms of the topic to", topicfile
        save_topics(topicfile, sorted_topics, top=top)

    timefile = savedir + '/smh' + mine_config + cluster_config + corpusname + '.time'
    print "Saving times to", timefile
    save_time(timefile, total_time)
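
# Hedged usage sketch: the save directory is a placeholder and only a few
# parameters are overridden; everything else keeps the defaults declared above.
discover_topics('./data/train_drugReviews40000.ifs',
                './data/train_drugReviews40000.vocab',
                './results',
                tuple_size=2,
                cooccurrence_threshold=0.08,
                top_terms_numbers=[10, 20])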
Example No. 10
import smh
import pickle
import operator

INVERT_INDEX_FILE = 'google.ifs'
MODEL_FILE = 'google.model'

model = smh.listdb_load(MODEL_FILE)
ifs = smh.listdb_load(INVERT_INDEX_FILE)
idToFileName = {}
with open('indexToFile.pickle', 'rb') as handle:
    idToFileName = pickle.load(handle)

# Create an array with all images associated with each objectDiscovered
allObjets = []
for objectDiscovered in model.ldb:
    imageCount = {}
    for visualWord in objectDiscovered:
        visualWordName = visualWord.item
        for image in ifs.ldb[visualWordName]:
            if image.item not in imageCount:
                imageCount[image.item] = 0
            imageCount[image.item] += 1
    allObjets.append(imageCount)

print('Size of allObjets: {}'.format(len(allObjets)))

allObjetsRankingFile = []
for objectDiscovered in allObjets:

    # Sort list of images based on number of visual words
    # (assumed completion of the truncated snippet: keep (imageID, count) pairs
    # ranked by how many of the object's visual words appear in each image)
    rankedImages = sorted(objectDiscovered.items(), key=operator.itemgetter(1), reverse=True)
    allObjetsRankingFile.append(rankedImages)
Example No. 11
    verbose("Reading vocabulary")
    for lang in opts.LANG:
        verbose("Reading vocabulary", lang)
        with open(
                os.path.join(opts.idir,
                             "{0}wiki{1}.voca".format(lang,
                                                      opts.sufix))) as LANG:
            for line in LANG:
                bits = line.strip().split(" = ")
                w = bits[0]
                idx = int(bits[1])
                idx2word[idx] = w
                word2idx[w] = idx

    verbose("Loading file ifs:", opts.IFS)
    ifs = listdb_load(opts.IFS)

    if not opts.l:
        params = [(int(r), s2l(s, r), s) for r, s in opts.params]
    else:
        params = [(int(r), int(l), 0) for r, l in opts.params]
    params.sort()

    cs = []

    for r, l, s in params:
        verbose("======================================= experiment for", r, l,
                s)
        if s > 0:
            verbose(
                "Experiment tuples (r) {0}, Number of tuples (l) {1}, S* {2}".
Example No. 12
def BOWcorpus2emb(corpusFN,
                  vocSize,
                  Train=False,
                  Validate=False,
                  labelsFN=None,
                  forSMH=False):
    """
	Returns a generator of bow embeddings, taking documents from the pointed .corpus file
	If using labelsFN, check that corpusFN and labelsFN are the same length (and correspond to each other)
	"""

    # If both flags are set, fall back to iterating over the full corpus
    if Train and Validate:
        Validate = False
        Train = False

    # Setting the Train/Validate split parameters
    if Train or Validate:
        train_size = 0
        with open(corpusFN, "r") as f:
            for line in f:
                train_size += 1

        # Fraction of the corpus held out for validation
        valNum = 0.2
        valSize = int(train_size * valNum)

        random.seed(12345678)
        valIndexes = set(random.sample(range(train_size), valSize))

    while True:
        if labelsFN:
            f = open(labelsFN, "r")

        corpus = listdb_load(corpusFN)

        for index, doc in enumerate(corpus.ldb):
            # Block to handle Train/Validate division
            if Train:
                if index in valIndexes:
                    continue
            elif Validate:
                if index not in valIndexes:
                    continue
            # Block ends.

            emb = [0 for i in range(vocSize)]

            for wordBundle in doc:

                if wordBundle.item < vocSize:
                    emb[wordBundle.item] = wordBundle.freq

            if not forSMH:
                if labelsFN:
                    label = float(f.readline())
                    yield (np.array([emb]), np.array([label]))
                else:
                    yield (np.array([emb]))
            else:
                if labelsFN:
                    label = float(f.readline())
                    yield emb, label
                else:
                    yield emb

        if labelsFN:
            f.close()
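
# Hedged usage sketch (file names and vocabulary size are placeholders). The
# generator cycles over the corpus forever, so pull items with next(); with
# forSMH=False and a labels file it yields a (1, vocSize) embedding plus a
# (1,) label array, ready for a Keras-style fit_generator.
gen = BOWcorpus2emb('train.corpus', 20000, Train=True, labelsFN='train.labels')
x, y = next(gen)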