示例#1
0
    mix_sources = [
        'amazon+twitter', 'amazon+ebay', 'twitter+ebay', 'amazon+twitter+ebay'
    ]
    tree_category = 'electronics'
    combos = [
        ('amazon', ['twitter', 'ebay'], tree_category),
        ('twitter', ['amazon', 'ebay'], tree_category),
        ('ebay', ['amazon', 'twitter'], tree_category),
        # new mixed training set test
        #('amazon+twitter', ['amazon','twitter', 'ebay'], tree_category),
        #('amazon+ebay', ['amazon','twitter', 'ebay'], tree_category),
        #('twitter+ebay', ['amazon','twitter', 'ebay'], tree_category),
        ('amazon+twitter+ebay', ['amazon', 'twitter', 'ebay'], tree_category),
    ]
    corpus, y = util.read_from_sources(all_sources,
                                       tree_category=tree_category,
                                       max_examples=int(options.max_examples))

    corpus = util.stem_corpus(
        corpus) if options.use_stemming is True else corpus

    # Generate the combined corpus and y for the mixed training set test.
    for mix in mix_sources:
        sourceList = mix.split('+')

        newY = []
        newCorpus = []
        for source in sourceList:
            newY = newY + y[source]
            newCorpus = newCorpus + corpus[source]
        y[mix] = newY
from gensim import corpora, models, similarities
import util

# Exploratory script: fit a 20-topic LDA model on TF-IDF-weighted Amazon
# "books" reviews and print one of the learned topics.

all_sources = ['amazon']
tree_category = 'books'
max_examples = 5000
num_topics = 20

# `corpus` is indexed by source name below; `y` is returned alongside it but
# is not used by this script.
corpus, y = util.read_from_sources(all_sources,
                                   tree_category=tree_category,
                                   max_examples=max_examples)

reviews = corpus[all_sources[0]]

# encode('ascii', 'ignore') silently drops every non-ASCII character.
# NOTE(review): under Python 3 this yields bytes, not str - confirm that
# util.clean_sentence accepts that before porting.
cleaned_reviews = [
    util.clean_sentence(review.encode('ascii', 'ignore'))
    for review in reviews
]

# gensim expects each example to be a list of words, instead of a long string
texts = [cleaned.split() for cleaned in cleaned_reviews]

dictionary = corpora.Dictionary(texts)

# Bag-of-words representation of every review, using the dictionary's ids.
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# Re-weight the bag-of-words counts with TF-IDF before topic modelling.
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

# Don't use this - HDP with TF-IDF will crash your computer.
# model = models.hdpmodel.HdpModel(corpus_bow, id2word=dictionary, T = 20)

model = models.ldamodel.LdaModel(corpus_tfidf,
                                 id2word=dictionary,
                                 num_topics=num_topics)

# Per-document topic distributions, materialised into a list.
topic_distributions = list(model[corpus_tfidf])

# print_topic(topicid, topn=10) returns the topic as a formatted string; it
# does not write anything itself, so the result has to be printed explicitly.
print(model.print_topic(10))
    mix_sources = ['amazon+twitter', 'amazon+ebay', 'twitter+ebay', 'amazon+twitter+ebay']
    tree_category = 'videogames'
    
    combos = [('amazon', ['twitter', 'ebay'], tree_category),
                ('twitter', ['amazon', 'ebay'], tree_category),
                ('ebay', ['amazon', 'twitter'], tree_category),]
    """
                #new mixed traning set test
                ('amazon+twitter', ['amazon','twitter', 'ebay'], tree_category),
                ('amazon+ebay', ['amazon','twitter', 'ebay'], tree_category),
                ('twitter+ebay', ['amazon','twitter', 'ebay'], tree_category),
                ('amazon+twitter+ebay', ['amazon','twitter', 'ebay'], tree_category),

                ]
    """
    corpus, y = util.read_from_sources(all_sources, tree_category = tree_category, max_examples = int(options.max_examples))
    
    corpus = util.stem_corpus(corpus) if options.use_stemming is True else corpus
    for master_source, external_sources, tree_category in combos:
        learn_cross_domain(master_source, external_sources, corpus, y, tree_category)
# Generate the combined corpus and y for the mixed training set test.
"""
    for mix in mix_sources:
        sourceList = mix.split('+')

        newY = []
        newCorpus = []
        for source in sourceList:
            newY = newY + y[source]
            newCorpus = newCorpus + corpus[source]
        y[mix] = newY