Example #1
def do_lda(num_topics, passes, iterations, chunksize, tfidf, wiki_path=None):
    model_name = 'tweets_'

    logging.info('Loading user dictionary...')
    dictionary = corpora.Dictionary.load('data/dictionary/tweets.dict')
    corpus = dictionary.corpus

    if tfidf is True:
        logging.info('Computing TF-IDF...')
        tfidf_model = TfidfModel(corpus, normalize=False)
        logging.info('Transforming the corpus...')
        corpus = [tfidf_model[doc] for doc in corpus]  # apply the TF-IDF weighting to each document exactly once
        model_name += 'tfidf_'

    if wiki_path is not None:
        model_name += 'wiki_'

    logging.info('Performing LDA on user corpus...')
    model, vectors, ids = perform_lda(dictionary=dictionary, corpus=corpus, num_topics=num_topics, passes=passes,
                                      iterations=iterations, chunksize=chunksize, wiki_path=wiki_path)
    model.print_topics(topics=num_topics, topn=10)
    model.save('data/model/' + model_name + str(num_topics) + '.lda')
    enpickle(vectors, 'data/vector/' + model_name + str(num_topics) + '.pkl')
    enpickle(ids, 'data/vector/ids.pkl')
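do_lda depends on the project helpers perform_lda and enpickle, which are not shown in these examples. As a rough guide only, here is a minimal self-contained sketch of how the gensim Dictionary, TfidfModel, and LdaModel calls fit together; the toy corpus and parameter values are placeholders, not taken from the original project, and the keyword names follow a recent gensim API.

import logging

from gensim import corpora
from gensim.models import LdaModel, TfidfModel

logging.basicConfig(level=logging.INFO)

# Toy documents standing in for the preprocessed tweets.
documents = [['topic', 'modeling', 'tweets'],
             ['tweets', 'about', 'python'],
             ['python', 'topic', 'models']]

dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Optional TF-IDF weighting, mirroring the `tfidf` branch above.
tfidf_model = TfidfModel(corpus, normalize=False)
corpus = [tfidf_model[doc] for doc in corpus]

model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2,
                 passes=5, iterations=50, chunksize=2000)
print(model.print_topics(num_topics=2, num_words=5))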
Example #2
def crawl_wiki(model_path):
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords(model_path=model_path, threshold=0.001)
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/others/wikis.pkl')
    print '\n'
    return wikis
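The disambiguation branch and the not-found branch above repeat the same keyword-fallback logic. One possible way to factor it out is a small helper that returns a modified keyword to retry with, or None when no fallback applies; this is a sketch that mirrors the original checks, not part of the project's code.

def fallback_keyword(keyword):
    """Return a modified keyword to retry the search with, or None if no fallback applies."""
    modified = False
    if '-' in keyword:
        keyword = keyword.replace('-', ' ')   # convert hyphen into white space
        modified = True
    if keyword.islower() and len(keyword) <= 5:
        keyword = keyword.upper()             # short lowercase terms are often acronyms
        modified = True
    return keyword if modified else None

# In crawl_wiki, both branches could then do:
#     new_keyword = fallback_keyword(keyword)
#     if new_keyword is not None:
#         keyword, stop = new_keyword, False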
Example #3
    count = 0
    doc_num = len(wikis)
    new_wikis = []
    keywords = []
    for keyword, wiki in wikis.items():
        count += 1

        print '\r', count, '/', doc_num,
        text = wiki['text']
        cleaned = clean_text(text)  # delete irrelevant characters

        wiki = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
        for token in tokens:
            word, pos = token.split('/')
            wiki.append(word)

        # convert compound word into one token
        wiki = convert_compound(wiki)

        # filter stop words, long words, and non-english words
        wiki = [w for w in wiki if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]  # FIXME: this allows non-english characters to be stored

        new_wikis.append(wiki)
        keywords.append(keyword)

    print '\n'

    logging.info('Saving wiki corpus...')
    enpickle(new_wikis, 'data/processed/wikis.pkl')
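The FIXME in the stop-word filter above notes that non-English characters can slip through, since islower() is also true for lowercase non-ASCII letters. One possible tightening is to restrict tokens to ASCII letters; this is a sketch, under the assumption (not confirmed by the snippet) that convert_compound joins compound words with an underscore or hyphen, and `stopwords` is the same set used above.

import string

# Lowercase ASCII letters, plus the separators assumed to come from convert_compound.
ALLOWED_CHARS = set(string.ascii_lowercase) | set('_-')

def keep_token(w, stopwords):
    return (w not in stopwords
            and 2 <= len(w) <= 15
            and set(w) <= ALLOWED_CHARS)  # rejects digits, punctuation, and non-ASCII letters

# wiki = [w for w in wiki if keep_token(w, stopwords)]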
Example #4
            continue
        if text not in text_index:
            text_index[text] = ''
            tweets[id] = text

    # other english tweets
    other_file = open('data/original/agreegatedEnglishTweets.csv', 'rb')
    other_csv = csv.reader(other_file)
    for row in other_csv:
        id, text = row
        id = 'other_' + id
        if len(text) < 10:
            continue
        if text not in text_index:
            text_index[text] = ''
            tweets[id] = text

    # additional tweets
    other_file = open('data/original/additional.csv', 'rb')
    other_csv = csv.reader(other_file)
    for row in other_csv:
        id, text = row
        id = 'additional_' + id
        if len(text) < 10:
            continue
        if text not in text_index:
            text_index[text] = ''
            tweets[id] = text

    enpickle(tweets, 'data/processed/tweets.pkl')
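The read/filter/deduplicate block above is repeated verbatim for each CSV file. A possible refactor folds it into one helper and uses a set for the seen-text index instead of a dict with empty-string values; the file paths and id prefixes mirror the example, but this is a sketch, not the project's code.

import csv

def add_tweets(path, prefix, tweets, seen_texts, min_length=10):
    """Read id,text rows from a CSV file and keep unseen tweets of at least min_length characters."""
    with open(path, 'rb') as f:            # 'rb' as in the original Python 2 csv usage
        for tweet_id, text in csv.reader(f):
            if len(text) < min_length:
                continue
            if text not in seen_texts:     # a set plays the role of text_index
                seen_texts.add(text)
                tweets[prefix + tweet_id] = text

# tweets, seen_texts = {}, set()
# add_tweets('data/original/agreegatedEnglishTweets.csv', 'other_', tweets, seen_texts)
# add_tweets('data/original/additional.csv', 'additional_', tweets, seen_texts)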