Example #1
def createLsiModelforCorpus(corpusfile, dictfile, numtop):
    print "\nLoading dictionary..."
    dict = corpora.Dictionary.load_from_text(dictfile)
    print(dict)
    print "\nLoading corpus..."
    corpus = corpora.MmCorpus(corpusfile)
    print(corpus)
    print "\nPerforming Latent Semantic Indexing..."
    lsi = LsiModel(corpus=corpus, num_topics=numtop, id2word=dict, distributed=False)
    ## This is the fancy stochastic (aka truncated) SVD, however it throws runtime memory errors for me (e.g. segmentation fault)
    #lsi = stochastic_svd(corpus,rank=100,num_terms=args.ntopics)
    corpustopics=lsi.show_topics(num_words=10, log=True, formatted=False)

    rootdir=os.getcwd()
    foldername='lsi_output'
    folderpath=os.path.join(rootdir,foldername)
    if os.path.exists(folderpath):
        shutil.rmtree(folderpath)
    os.makedirs(folderpath)
    os.chdir(folderpath)
    lsimodelfile=(str(args.corpus).replace('.mm',''))+'_lsi.model'
    lsi.save(lsimodelfile)
    filename1= (str(args.corpus).replace('.mm',''))+'_lsi_topics.pkl'
    with open(filename1,'wb') as output:
        pickle.dump(corpustopics, output)
    os.chdir(rootdir)

    return corpustopics, lsi
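
A minimal usage sketch for the function above, assuming a Matrix Market corpus and a gensim dictionary have already been written to disk. The file names and topic count are placeholders, and note that the function derives its output file names from a module-level argparse args object, so the reload path below only matches if args.corpus is 'mycorpus.mm'.

# Hypothetical usage of createLsiModelforCorpus (file names are placeholders).
import os

from gensim.models import LsiModel

corpustopics, lsi = createLsiModelforCorpus('mycorpus.mm', 'mydict.txt', 100)

# The persisted model can later be reloaded without retraining.
lsi_reloaded = LsiModel.load(os.path.join('lsi_output', 'mycorpus_lsi.model'))
print(lsi_reloaded.show_topics(num_topics=5, num_words=10))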
Example #2
def train_model(filename, output_name, data=None):
    output = data if data is not None else {}

    output['dataset'] = filename
    output['output_name'] = output_name

    df = pd.read_csv('./data/dataset/%s' % filename)
    lemmas_list = []

    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '').replace(']', '').replace(',', '').replace('\'', '')
        lemmas_list.append(lemmas.split())

    dictionary = corpora.Dictionary(lemmas_list)
    make_dir('./data/dicts/')
    dictionary.save('./data/dicts/%s_corpus.dict' % output_name)

    output['dict'] = '%s_corpus.dict' % output_name

    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]

    tfidf = models.TfidfModel(clean_doc, normalize=True)

    lsi = LsiModel(corpus=tfidf[clean_doc], id2word=dictionary, num_topics=200)
    make_dir('./data/models')
    lsi.save('./data/models/%s_model.txt' % output_name)
    output['model'] = '%s_model.txt' % output_name

    return output
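
A sketch of how the artifacts saved by train_model might be used to project a new document into the LSI space. The CSV name and lemma list are placeholders; note that the TfidfModel built inside train_model is not persisted, so applying the LSI model directly to a bag-of-words vector, as below, is only an approximation of the original tf-idf pipeline.

# Hypothetical reuse of the artifacts written by train_model('reviews.csv', 'reviews').
from gensim import corpora
from gensim.models import LsiModel

dictionary = corpora.Dictionary.load('./data/dicts/reviews_corpus.dict')
lsi = LsiModel.load('./data/models/reviews_model.txt')

new_lemmas = ['good', 'battery', 'life', 'camera']
bow = dictionary.doc2bow(new_lemmas)
print(lsi[bow])  # list of (topic_id, weight) pairs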
Example #3
class MyModel:
    def __init__(self, dict_file=None, corpus_model=None, corpus_file=None):
        self.dict_file = dict_file
        self.dictionary = None
        self.corpus = None

        if dict_file is not None:
            self.dictionary = corpora.Dictionary.load(dict_file)
        if corpus_model:
            self.corpus = corpus_model
        elif corpus_file:
            self.corpus = corpora.MmCorpus(corpus_file)

        self.tf_idf_model = None
        self.corpus_tf_idf = None
        self.lsi_model = None
        self.corpus_lsi = None

        self.lda_model = None
        self.corpus_lda = None

    def tf_idf(self):
        self.tf_idf_model = models.TfidfModel(corpus=self.corpus, normalize=True)
        # corpus_vector = [vector for vector in self.corpus]
        self.corpus_tf_idf = self.tf_idf_model[self.corpus]

    def lsi(self):
        self.tf_idf()
        if self.corpus_tf_idf and self.dictionary:
            self.lsi_model = LsiModel(self.corpus_tf_idf, id2word=self.dictionary, num_topics=2)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
            print(self.lsi_model.print_topic(1))  # topics are 0-indexed; 1 is the last of the two
        elif self.corpus_tf_idf:
            self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]

    def lda(self):
        self.lda_model = models.LdaModel(corpus=self.corpus)
        self.corpus_lda = self.lda_model[self.corpus]

    def add_document_lsi(self, addition_corpus_tf_idf, addition_vector_tf_idf):
        self.lsi_model.add_documents(addition_corpus_tf_idf)
        lsi_vector = self.lsi_model[addition_vector_tf_idf]
        return lsi_vector

    def save_lsi(self, name='/serialise/model.lsi'):
        self.lsi_model.save(name)

    def save_lda(self, name='/serialise/model.lda'):
        self.lda_model.save(name)

    @staticmethod
    def load_lsi(name='/tmp/model.lsi'):
        my_model = MyModel()
        my_model.lsi_model = models.LsiModel.load(name)
        return my_model
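
A small end-to-end sketch exercising the class above with an in-memory corpus; the toy documents, dictionary path and serialisation path are invented for illustration.

# Toy end-to-end run of MyModel (documents and paths are made up).
from gensim import corpora

texts = [['human', 'computer', 'interaction'],
         ['graph', 'trees', 'minors'],
         ['human', 'system', 'graph']]
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/toy.dict')
bow_corpus = [dictionary.doc2bow(text) for text in texts]

model = MyModel(dict_file='/tmp/toy.dict', corpus_model=bow_corpus)
model.tf_idf()                    # build the TF-IDF model and transformed corpus
model.lsi()                       # 2-topic LSI on top of the TF-IDF corpus
model.save_lsi('/tmp/model.lsi')

reloaded = MyModel.load_lsi('/tmp/model.lsi')
print(reloaded.lsi_model)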
Example #4
def run():
  try:
    print "starting to build LSI Model"

    start = datetime.now()
    documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
    number_of_documents = len(documents)
    print "number_of_documents:", number_of_documents

    stopwords = []
    stopwords += [month.lower() for month in month_to_number.keys()]
    stopwords += nltk_stopwords.words('english')
    print "stopwords:", len(stopwords)
    with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
        stopwords.extend([word for word in f.read().decode("utf-8").split("\n") if word and not word.startswith("#")])
    stopwords = set(stopwords)

    texts = [[word for word in document.lower().replace("#"," ").replace("_"," ").replace("("," ").replace(")"," ").replace("/"," ").replace(":"," ").replace("."," ").split() if word not in stopwords and len(word) > 3 ] for document in documents]

    counter = Counter()
    for text in texts:
        counter.update(text)

    texts = [[token for token in text if counter[token] > 1] for text in texts]

    dictionary = Dictionary(texts)
    print "dictionary:", dictionary
    dictionary.save(path_to_directory_of_this_file + "/dictionary")

    corpus = [dictionary.doc2bow(text) for text in texts]
    print "corpus:", type(corpus)

    print "generating lsi model"
    
    lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print "saving LSI model"
    lsi.save(path_to_directory_of_this_file + "/model")

    Topic.objects.all().delete()
    topics = []
    for topic in lsi.show_topics():
        topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))

    Topic.objects.bulk_create(topics)

  except Exception as e:
    print(e)
Example #5
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except Exception:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except Exception:
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")

    evaluate_classifier(clf,
                        rcv1_test,
                        rcv1_test_target,
                        get_features=get_lsi_features)
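
The two feature extractors return sparse dicts keyed by LSI topic id and token id respectively; a quick check, which would have to sit inside main() right after their definitions (the sample token list is invented):

    # Hypothetical quick check of the two feature functions defined above.
    sample = ['stock', 'markets', 'fell', 'sharply']
    print(get_lsi_features(sample))   # e.g. {0: 0.12, 3: -0.07, ...} -- sparse LSI topic weights
    print(get_bow_features(sample))   # {token_id: count, ...}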
Example #6
    def get_lsa_model(self, n_topics=50, recalculate=False, from_scratch=True):

        filepath = self.paths.get_lsa_filepath(n_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError('No LSA file exists but from_scratch is False')

            trigram_dictionary = self.lda_builder.get_corpus_dict()
            trigram_bow_corpus = self.lda_builder.get_trigram_bow_corpus(trigram_dictionary)

            print('Building LSA model...')
            lsi = LsiModel(trigram_bow_corpus, id2word=trigram_dictionary, num_topics=n_topics)

            lsi.save(filepath)
            print('LSA model (n_topics={}) written to {}'.format(n_topics, filepath))
        else:
            print('Loading LSA model (n_topics={})...'.format(n_topics))
            lsi = LsiModel.load(filepath)

        return lsi
class TextProcessor:
    def __init__(self, n_users, n_samples, n_dims):
        self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
        self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None

        self.dictPath, self.tfIdfPath, self.lsiPath, self.ldaPath, self.w2vPath, self.w2vVecPath =\
            conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='dict'), \
            conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='tfidf'),\
            conf.get_filename_via_tpl('model', model_type='lsi', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lsi_model'), \
            conf.get_filename_via_tpl('model', model_type='lda', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lda_model'),\
            conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='w2vmodel'), \
            conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='vec.txt')

    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded completely.' % model_type)
        except IOError:
            logger.error(
                'The %s model doesn\'t exist. Please train the model before load it.'
                % model_type)
        finally:
            return model

    def tf_idf_transform(self, doc):
        """
        Perform tf-idf transformation on doc.
        """
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)

        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl('tfidf',
                                                      n_users=self.nUsers,
                                                      postfix='mm',
                                                      n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' %
                    (np.array(tfidf_corpus).shape, tfidf_corpus_path))

        return tfidf_corpus

    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training lsi model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)
        # print self.lsiModel[corpus]

        conf.mk_dir(self.lsiPath)

        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl('lsi',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.' %
                    (np.array(lsi_corpus).shape, lsi_corpus_path))

        return lsi_corpus

    def lda_transform(self,
                      corpus_tf_idf,
                      train_separated=False,
                      is_update=False):
        """
        Init a lda model with a n_topics whose default is 500, then fit it with corpus_tf_idf and transform it.
        :param corpus_tf_idf: Corpus which has been transformed into tf-idf matrix.
        :param train_separated: The model is going to be train with all corpus one time or some of them separately one time.
        :param is_update: Whether the training to be perform is to construct a new model or update one existed.
        :return: lda corpus.
        """
        logger.info('Training lda model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        if is_update:
            # A ldaModel had been trained before and now update the model with other corpus.
            if self.ldaModel is None:
                self.load_model('lda')
            self.ldaModel.update(corpus_tf_idf)
            logger.info('Lda model has been updated successfully.')
            return self.ldaModel[corpus_tf_idf]

        if train_separated:
            # corpus = []
            # spacing = 10000
            # for i in range(int(len(corpus_tf_idf)/spacing)):
            #     corpus.append(corpus_tf_idf[i*spacing: i])
            # self.ldaModel = LdaModel()
            pass

        self.ldaModel = LdaModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)

        conf.mk_dir(self.ldaPath)
        self.ldaModel.save(self.ldaPath)
        logger.info('lda model has been saved in %s' % self.ldaPath)

        lda_corpus = self.ldaModel[corpus_tf_idf]
        lda_corpus_path = conf.get_filename_via_tpl('lda',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lda_corpus_path)
        corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
        logger.info('Lda corpus with a shape of %s has been saved in %s.' %
                    (np.array(lda_corpus).shape, lda_corpus_path))

        return lda_corpus

    def w2v_transform(self, sentences):
        """
        Perform word2vec on texts and obtain a w2v model.
        :param sentences: Sentences that each one of it contains a list of words of a text.
        :return: W2v model.
        """
        logger.info('Training w2v model with a dim of %d...' % self.nDims)
        # file = open(infile_path, 'r', encoding='utf-8') if infile_path.find('\n') < 0 else StringIO(infile_path)
        # sentences = []
        # for sen in file.readlines():
        #     sentences.append(sen.strip().split(' '))
        # print(sentences)
        self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)

        conf.mk_dir(self.w2vPath)
        self.w2vModel.save(self.w2vPath)
        self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)
        # print(model['['])

        # Construct w2v corpus
        w2v_corpus = []
        for sen in sentences:
            vec = [0] * self.nDims
            if len(sen) > 0:
                for word in sen:
                    vec = list(
                        map(lambda m, n: m + n, vec, self.w2vModel[word]))
                    # vec += self.w2vModel[word]
            w2v_corpus.append(vec)

        w2v_corpus_path = conf.get_filename_via_tpl('w2v',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims)
        conf.mk_dir(w2v_corpus_path)

        with open(w2v_corpus_path, 'w') as fp:
            csv_writer = csv.writer(fp)
            for line in w2v_corpus:
                csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s. ' % w2v_corpus_path)

        return w2v_corpus

    def load_corpus(self, model_type, dense=False):
        corpus = None
        try:
            if model_type == 'tfidf':
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl('tfidf',
                                              n_users=self.nUsers,
                                              postfix='mm',
                                              n_samples=self.nSamples))
            elif model_type in ['lsi', 'lda']:
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl(model_type,
                                              n_users=self.nUsers,
                                              n_samples=self.nSamples,
                                              n_dims=self.nDims,
                                              postfix='mm'))
            elif model_type == 'w2v':
                corpus = np.loadtxt(conf.get_filename_via_tpl(
                    model_type,
                    n_users=self.nUsers,
                    n_samples=self.nSamples,
                    n_dims=self.nDims),
                                    dtype=np.float64,
                                    delimiter=',')

            logger.info('%s corpus with a shape of %s has been loaded. ' %
                        (model_type, np.array(corpus).shape))

            if dense and model_type in ['tfidf', 'lsi', 'lda']:
                corpus = matutils.corpus2dense(corpus,
                                               self.nDims,
                                               self.nSamples * self.nUsers,
                                               dtype=np.float64).T
            else:
                corpus = np.array(corpus)
        except Exception as e:
            raise e
        return corpus

    @staticmethod
    def corpus2dense(corpus, n_terms, n_docs=conf.N_SAMPLES, dtype=np.float64):
        return matutils.corpus2dense(corpus, n_terms, n_docs, dtype).T

    def load_vec(self, vec_type):
        logger.info('Loading %s vectors...' % vec_type)
        try:
            corpus_vec = self.load_corpus(vec_type, True)
        except Exception as e:
            raise e
        data = []
        for i in range(self.nUsers):
            data.append(corpus_vec[i * self.nSamples:(i + 1) * self.nSamples])
        data = np.array(data, dtype=np.float64)
        return data
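
A sketch of the intended call order for TextProcessor, assuming the project-specific conf module (path templates, N_SAMPLES) is importable; the tokenised documents and sizes here are placeholders.

# Hypothetical driver for the TextProcessor pipeline above.
docs = [['user', 'posts', 'about', 'music'],
        ['another', 'user', 'posts', 'about', 'sports']]

tp = TextProcessor(n_users=2, n_samples=1, n_dims=2)
tfidf_corpus = tp.tf_idf_transform(docs)     # builds dictionary + TF-IDF, serialises both
lsi_corpus = tp.lsi_transform(tfidf_corpus)  # 2-dim LSI on top of TF-IDF
lda_corpus = tp.lda_transform(tfidf_corpus)  # LDA with the same dimensionality

# Later runs can reload everything from disk instead of retraining.
tp2 = TextProcessor(n_users=2, n_samples=1, n_dims=2)
tp2.load_model('lsi')
dense = tp2.load_corpus('lsi', dense=True)   # shape: (n_users * n_samples, n_dims)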
def main():
    parser = ArgumentParser(
        description=
        'Wrapper script for churning datasets of the wiki or elasticsearch kind through gensim to produce topic models. Please see the gensim documentation for more information.'
    )
    parser.add_argument('-ds',
                        '--dataset',
                        default='wiki',
                        help='What kind of dataset to use. (wiki,es,file)')
    parser.add_argument('-d',
                        '--dump-file',
                        help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l',
                        '--limit',
                        help='Wiki: How many documents to extract from wiki')
    parser.add_argument('--model-id',
                        default='model',
                        help='Filename for created model.')
    parser.add_argument(
        '--model-type',
        default='lsi',
        help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    parser.add_argument('--n-topics',
                        default=10,
                        help='Number of topics to model.')
    parser.add_argument('--n-passes',
                        default=1,
                        help='Number of passes for LDA model.')
    parser.add_argument('--w2v-size',
                        default=100,
                        help='size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.')
    parser.add_argument('-q',
                        '--query',
                        default=None,
                        help='Elasticsearch: Query to use to fetch documents')
    parser.add_argument('--index', help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type',
                        default='doc',
                        help='Elasticsearch: data type in index.')
    parser.add_argument(
        '--data-dir',
        help='Directory to save the generated models and vocabularies into.')
    parser.add_argument(
        '--vocab',
        help=
        'Prebuilt Vocabulary file. Use this to avoid having to generate one.')

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error(
            "Please specify the elasticsearch index to fetch from using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s" %
                     (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index,
                                       read_doc_type=doc_type,
                                       query=query,
                                       normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn,
                                   num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn,
                              num_articles=limit,
                              normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == 'vocabulary':
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         passes=n_passes,
                         id2word=vocab)

    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
Example #9
def run():
  try:
    print("starting to build LSI Model")

    start = datetime.now()
    documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
    number_of_documents = len(documents)
    print("number_of_documents:", number_of_documents)

    texts = [tokenize(document) for document in documents]

    counter = Counter()
    for text in texts:
        counter.update(text)

    texts = [[token for token in text if counter[token] > 1] for text in texts]

    print("texts:", len(texts), texts[:5])

    dictionary = Dictionary(texts)
    #print "dictionary:", dictionary
    dictionary.save(path_to_directory_of_this_file + "/dictionary")

    corpus = [dictionary.doc2bow(text) for text in texts]
    print("corpus:", type(corpus))

    print("generating lsi model")
    
    lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print("saving LSI model")
    lsi.save(path_to_directory_of_this_file + "/model")


    # nullifying all topics on features and places
    Feature.objects.exclude(topic=None).update(topic=None)
    Place.objects.exclude(topic=None).update(topic=None)

    Topic.objects.all().delete()
    print("deleted all topics")
    topics = []
    for topic in lsi.show_topics():
        topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))

    Topic.objects.bulk_create(topics)
    print("bulk created all topics")


    """
    # re-create topics for all features in database
    for feature in Feature.objects.exclude(text=None).exclude(text=""):
        words = tokenize(feature.text)
        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
                topic_id = sorted(probabilities, key=lambda tup: -1*tup[1])[0][0]
                if topic_id:
                    feature.topic_id = topic_id
                    feature.save()

    # assign as topic to each place based on most popular topic found in features
    for place_id in Place.objects.exclude(featureplace=None).values_list("id", flat=True):
        counter = Counter(Feature.objects.filter(featureplace__place_id=place_id).values_list("topic_id"))
        print "counter:", counter
    """


  except Exception as e:
    print(e)
Example #10
    # corpus_test_word_seg_tfidf = model.__getitem__(corpus_test_word_seg)
    # corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_tfidf', corpus_test_word_seg_tfidf)

    corpus_train_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_train_word_seg_tfidf')
    corpus_dev_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_dev_word_seg_tfidf')
    corpus_test_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_test_word_seg_tfidf')
    corpus_word_seg_tfidf = []
    corpus_word_seg_tfidf.extend(corpus_train_word_seg_tfidf)
    corpus_word_seg_tfidf.extend(corpus_dev_word_seg_tfidf)
    corpus_word_seg_tfidf.extend(corpus_test_word_seg_tfidf)


    # lsi
    print('Start training lsi...')
    lsi_model = LsiModel(corpus=corpus_word_seg_tfidf, id2word=dictionary_word_seg, num_topics=400)
    lsi_model.save('../topic_model/word_seg_lsi_model')
    corpus_train_word_seg_lsi = lsi_model[corpus_train_word_seg_tfidf]
    corpus_dev_word_seg_lsi = lsi_model[corpus_dev_word_seg_tfidf]
    corpus_test_word_seg_lsi = lsi_model[corpus_test_word_seg_tfidf]
    corpora.MmCorpus.serialize('../topic_model/corpus_train_word_seg_lsi', corpus_train_word_seg_lsi)
    corpora.MmCorpus.serialize('../topic_model/corpus_dev_word_seg_lsi', corpus_dev_word_seg_lsi)
    corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_lsi', corpus_test_word_seg_lsi)

    #lda
    print('Start training lda...')
    lda_model = LdaModel(corpus=corpus_word_seg_tfidf, id2word=dictionary_word_seg, num_topics=100, update_every=1,
                         chunksize=1000, passes=1)
    lda_model.save('../topic_model/word_seg_lda_model')
    corpus_train_word_seg_lda = lda_model[corpus_train_word_seg_tfidf]
    corpus_dev_word_seg_lda = lda_model[corpus_dev_word_seg_tfidf]
    corpus_test_word_seg_lda = lda_model[corpus_test_word_seg_tfidf]
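
The snippet serialises the LSI-transformed corpora but stops after computing the LDA ones; presumably they would be written out the same way. A sketch, not part of the original fragment:

    # Hypothetical: persist the LDA corpora the same way as the LSI corpora above.
    corpora.MmCorpus.serialize('../topic_model/corpus_train_word_seg_lda', corpus_train_word_seg_lda)
    corpora.MmCorpus.serialize('../topic_model/corpus_dev_word_seg_lda', corpus_dev_word_seg_lda)
    corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_lda', corpus_test_word_seg_lda)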
def main():
    parser = ArgumentParser(
        description="wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information"
    )
    parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA  model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please specify the elasticsearch index to fetch from using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)
    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(
            read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es
        )
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))
    if not vocab_file or model_type == "vocabulary":
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == "vocabulary":
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)

    elif model_type == "word2vec":
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
Example #12
pp.pprint(sortedqueryresult)
#pp.pprint(sortedqueryresult[:10])

rootdir=os.getcwd()
foldername='lsi_output'
folderpath=os.path.join(rootdir,foldername)
if os.path.exists(folderpath):
    shutil.rmtree(folderpath)
os.makedirs(folderpath)

os.chdir(folderpath)

lsimodelfile=(str(args.corpus).replace('.mm',''))+'_lsi.model'
lsi.save(lsimodelfile)
filename1= (str(args.corpus).replace('.mm',''))+'_lsi_topics.txt'
filename2= (str(args.corpus).replace('.mm',''))+('_item_{0}_classification.txt'.format(args.query))
with open(filename1, 'w') as f:
    f.write(str(corpustopics))
with open(filename2, 'w') as f:
    f.write(str(queryresult))

os.chdir(rootdir)

end_time=time.time()
runtime=end_time-start_time
Example #13
class Feature(catscorpus.CatsCorpus, utils.Config):
	"""
	
	"""

	def __init__(self, root_path, is_tfidf=False):
		catscorpus.CatsCorpus.__init__(self, root_path=root_path)
		# Select training corpus
		self.is_tfidf = is_tfidf
		if self.is_tfidf:
			self.training_corpus = self.tfidf  # Take tfidf matrxi as input
		else:
			self.training_corpus = self.corpus # Take bow corpus as input

	def encoder_lda(self, num_topics=100, chunksize=500):
		"""
		
		"""

		self.num_topics = num_topics
		# Train LDA based on training dataset
		self.lda = LdaModel(corpus=self.training_corpus, id2word=self.dictionary, \
			                num_topics=num_topics, update_every=1, chunksize=chunksize, passes=1)
		# Convert bow into topic vectors
		self.corpus_lda = self.lda[self.training_corpus]

	def encoder_lsi(self, num_components=100, chunksize=500, is_tfidf=False):
		"""
		
		"""

		self.num_components = num_components
		# Train LSI based on training dataset
		self.lsi = LsiModel(corpus=self.training_corpus, id2word=self.dictionary, \
		                           num_topics=num_components, chunksize=chunksize) # initialize an LSI transformation
		# Convert bow into LSI projections
		self.corpus_lsi = self.lsi[self.training_corpus]

	def encoder_gbrbm(self, n_hidden=1000, lr=0.01, n_epoches=10, batch_size=100):
		"""
		"""

		n_visible        = len(self.dictionary)
		training_dataset = corpus2dense(self.training_corpus, num_terms=n_visible).transpose()
		self.rbm = GBRBM(n_visible, n_hidden=n_hidden, learning_rate=lr, momentum=0.95, \
			             err_function='mse', use_tqdm=False, sample_visible=False, sigma=1)
		self.rbm.fit(training_dataset, n_epoches=n_epoches, batch_size=batch_size, \
			         shuffle=True, verbose=True)
		self.corpus_rbm = self.rbm.transform(training_dataset)

	def save_gbrbm(self, model_path=None, output_path=None):
		"""
		"""
		
		model_path  = "%s/%s" % (model_path, "model")
		output_path = "%s/%s" % (output_path, "npy.mat.txt")

		# if model_path:
			# self.rbm.save(model_path)
		if output_path:
			# numpy_matrix = corpus2dense(self.corpus_lda, num_terms=self.num_topics)
			np.savetxt(output_path, self.corpus_rbm, delimiter=',')


	def save_lda(self, model_path=None, output_path=None):
		"""

		"""

		model_path  = "%s/%s" % (model_path, "model")
		output_path = "%s/%s" % (output_path, "npy.mat.txt")

		if model_path:
			self.lda.save(model_path)
		if output_path:
			numpy_matrix = corpus2dense(self.corpus_lda, num_terms=self.num_topics).transpose()
			np.savetxt(output_path, numpy_matrix, delimiter=',')

	def save_lsi(self, model_path=None, output_path=None):
		"""

		"""

		model_path  = "%s/%s" % (model_path, "model")
		output_path = "%s/%s" % (output_path, "npy.mat.txt")

		if model_path:
			self.lsi.save(model_path)
		if output_path:
			numpy_matrix = corpus2dense(self.corpus_lsi, num_terms=self.num_components).transpose()
			np.savetxt(output_path, numpy_matrix, delimiter=',')

	def random_sampling(self, num_samples):
		catscorpus.CatsCorpus.random_sampling(self, num_samples)
		# Select training corpus
		if self.is_tfidf:
			self.training_corpus = self.tfidf  # Take tfidf matrxi as input
		else:
			self.training_corpus = self.corpus # Take bow corpus as input

	def category_sampling(self, categories):
		catscorpus.CatsCorpus.category_sampling(self, categories)
		# Select training corpus
		if self.is_tfidf:
			self.training_corpus = self.tfidf  # Take tfidf matrxi as input
		else:
			self.training_corpus = self.corpus # Take bow corpus as input


	def __iter__(self):
		pass
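
A sketch of how the Feature class above might be driven, assuming a catscorpus corpus directory already exists at root_path; the paths and sizes are placeholders.

# Hypothetical driver for the Feature class (root_path and output paths are placeholders).
feat = Feature(root_path='data/cats_corpus', is_tfidf=True)

# 100-dimensional LSI projection of the TF-IDF corpus, then dump the model and a dense matrix.
feat.encoder_lsi(num_components=100, chunksize=500)
feat.save_lsi(model_path='output/lsi', output_path='output/lsi')

# Alternatively, an LDA encoding of the same training corpus.
feat.encoder_lda(num_topics=100, chunksize=500)
feat.save_lda(model_path='output/lda', output_path='output/lda')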