def train_unsupervised_model(k=10, alpha=.1, max_iters=25, convergence_threshold=.001, baseline=False, silent=False):
    '''
    Fit an EM-style topic model over snowball tweet data.

    phi is the documents-to-topics matrix (n docs x k topics); pi holds the
    per-topic probability estimates derived from phi.

    NOTE(review): this definition is immediately followed by a byte-identical
    redefinition of the same name later in the file, which shadows this one at
    import time -- this copy is dead code.

    NOTE(review): `converged` is initialized False and never reassigned, so the
    loop always runs the full `max_iters` iterations and `convergence_threshold`
    is unused.

    Returns (raw_tweets, tokenized_tweets, phi, pi, topics_to_models).
    '''
    global base_model
    base_model = None
    D = snowball.read_data()
    raw_tweets = D['tweet_text']
    ## 12/1 -- filter?
    # drop tweets tagged "#crc"
    tags = [snowball.which_tags(t) for t in raw_tweets]
    raw_tweets = [t for i, t in enumerate(raw_tweets) if not "#crc" in tags[i]]
    tokenized_tweets = [word_tokenize(tw) for tw in raw_tweets]

    def _seems_to_be_about_soccer(tweet):
        # tokens/hashtags associated with 2014 World Cup chatter; used to
        # filter soccer tweets out of the corpus
        terms = [
            "worldcup", "ger", "usavcrc", "fra", "italia", "mexvcrc",
            "#mexvcrc", "nedvscrc", "#nedvscrc", "nedcrc", "#nedcrc",
            "itavscrc", "#itavscrc", "uruvscrc", "#uruvscrc",
            "worldcup2014", "#worldcup2014", "uruguay"
        ]
        return any([t.lower() in terms for t in tweet])

    indices_to_keep = [
        idx for idx in range(len(tokenized_tweets))
        if not _seems_to_be_about_soccer(tokenized_tweets[idx])
    ]
    raw_tweets = [raw_tweets[idx] for idx in indices_to_keep]
    tokenized_tweets = [tokenized_tweets[idx] for idx in indices_to_keep]

    n = len(tokenized_tweets)
    alphas = [alpha] * k
    phi = np.zeros((n, k))
    for i in range(n):
        # initialize doc rows with random draws from a symmetric Dirichlet
        phi[i, :] = np.random.dirichlet(alphas)

    # initial topic probability estimates
    pi = estimate_pi(phi)

    if not silent:
        print("initial assignments (random)...")
        print_top_tweets_for_topics(phi, raw_tweets, pi)

    iter_ = 0
    converged = False
    while not converged and iter_ < max_iters:
        #######
        # 1. update language models
        #######
        topics_to_models = retrain_language_models(tokenized_tweets, phi, baseline=baseline)

        #######
        # 2. re-estimate \phi
        #######
        phi = estimate_phi(tokenized_tweets, topics_to_models, pi)
        pi = estimate_pi(phi)

        #######
        # assess convergence
        #######
        # NOTE(review): LL is only computed for display when not silent; it is
        # never compared against convergence_threshold.
        if not silent:
            print_top_tweets_for_topics(phi, raw_tweets, pi, n=20)
            cur_LL = LL(topics_to_models, pi, phi, tokenized_tweets)
            print("finished iter: %s; LL: %s" % (iter_, cur_LL))
            #print("finished iter: %s" % iter_)
            print("\n")

        iter_ += 1

    # idx0 = (-1* phi[:,0]).argsort()[:50]
    return raw_tweets, tokenized_tweets, phi, pi, topics_to_models
def train_unsupervised_model(k=10, alpha=.1, max_iters=25, convergence_threshold=.001, baseline=False, silent=False):
    '''
    Fit an EM-style topic model over snowball tweet data.

    phi is the documents-to-topics matrix (n docs x k topics); pi holds the
    per-topic probability estimates derived from phi.

    Parameters:
        k -- number of topics
        alpha -- symmetric Dirichlet concentration for initializing phi rows
        max_iters -- hard cap on EM iterations (assumed >= 1)
        convergence_threshold -- stop when |LL_t - LL_{t-1}| falls below this
        baseline -- passed through to retrain_language_models
        silent -- suppress progress output when True

    Returns (raw_tweets, tokenized_tweets, phi, pi, topics_to_models).

    Fixes vs. the earlier version: `converged` was never assigned, so
    `convergence_threshold` was dead and the loop always ran max_iters; the
    log-likelihood is now computed every iteration (not just when verbose)
    and used to detect convergence.
    '''
    global base_model
    base_model = None

    D = snowball.read_data()
    raw_tweets = D['tweet_text']

    ## 12/1 -- filter? drop tweets tagged "#crc"
    tags = [snowball.which_tags(t) for t in raw_tweets]
    raw_tweets = [t for i, t in enumerate(raw_tweets) if "#crc" not in tags[i]]
    tokenized_tweets = [word_tokenize(tw) for tw in raw_tweets]

    def _seems_to_be_about_soccer(tweet):
        # tokens/hashtags associated with 2014 World Cup chatter; used to
        # filter soccer tweets out of the corpus
        terms = [
            "worldcup", "ger", "usavcrc", "fra", "italia", "mexvcrc",
            "#mexvcrc", "nedvscrc", "#nedvscrc", "nedcrc", "#nedcrc",
            "itavscrc", "#itavscrc", "uruvscrc", "#uruvscrc",
            "worldcup2014", "#worldcup2014", "uruguay"
        ]
        return any(t.lower() in terms for t in tweet)

    indices_to_keep = [
        idx for idx in range(len(tokenized_tweets))
        if not _seems_to_be_about_soccer(tokenized_tweets[idx])
    ]
    raw_tweets = [raw_tweets[idx] for idx in indices_to_keep]
    tokenized_tweets = [tokenized_tweets[idx] for idx in indices_to_keep]

    n = len(tokenized_tweets)
    alphas = [alpha] * k
    phi = np.zeros((n, k))
    for i in range(n):
        # initialize doc rows with random draws from a symmetric Dirichlet
        phi[i, :] = np.random.dirichlet(alphas)

    # initial topic probability estimates
    pi = estimate_pi(phi)

    if not silent:
        print("initial assignments (random)...")
        print_top_tweets_for_topics(phi, raw_tweets, pi)

    iter_ = 0
    converged = False
    prev_LL = None  # log-likelihood from the previous iteration
    while not converged and iter_ < max_iters:
        #######
        # 1. update language models
        #######
        topics_to_models = retrain_language_models(tokenized_tweets, phi, baseline=baseline)

        #######
        # 2. re-estimate \phi
        #######
        phi = estimate_phi(tokenized_tweets, topics_to_models, pi)
        pi = estimate_pi(phi)

        #######
        # assess convergence: stop once the LL improvement drops below threshold
        #######
        cur_LL = LL(topics_to_models, pi, phi, tokenized_tweets)
        if prev_LL is not None and abs(cur_LL - prev_LL) < convergence_threshold:
            converged = True
        prev_LL = cur_LL

        if not silent:
            print_top_tweets_for_topics(phi, raw_tweets, pi, n=20)
            print("finished iter: %s; LL: %s" % (iter_, cur_LL))
            print("\n")

        iter_ += 1

    # idx0 = (-1* phi[:,0]).argsort()[:50]
    return raw_tweets, tokenized_tweets, phi, pi, topics_to_models
def _get_tags(tweets):
    '''
    Look up the tags for each tweet and collect the distinct tag vocabulary.

    Returns a pair: (per-tweet tag lists, list of unique tags across all tweets).
    '''
    per_tweet_tags = [snowball.which_tags(tweet) for tweet in tweets]
    all_unique_tags = list(set(itertools.chain.from_iterable(per_tweet_tags)))
    return per_tweet_tags, all_unique_tags