def topic_modeling_celery_task(collection_data, options, user, *args, **kwargs):
    """
    Async task to do gensim based topic modeling.

    :param collection_data: list of dicts, each with 'id' and 'filter' keys
    :param options: modeling options (chunking, numTopics, numPasses, ...)
    :param user: primary key of the requesting User
    :param args:
    :param kwargs:
    :return: the trained model's top topics
    """
    # get user from user id
    user = User.objects.get(pk=user)

    # get tokens from collection and parse with filters
    filtered_docs = []
    wordnet_status = options['wordNetSense']
    for item in collection_data:
        # override the collection filter's wordnet status. This should
        # probably live somewhere else in the future.
        tokens = CollectionParser(item['id'], item['filter'],
                                  wordnet_status=wordnet_status).get_bow()
        filtered_docs.append(tokens)

    chunked_words_bags = []

    # handle chunk by count case
    if options['chunking'] == 'count':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakword string
    elif options['chunking'] == 'breakword':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking
    elif options['chunking'] == 'none':
        chunked_words_bags = list(filtered_docs)

    # set up and execute gensim modeling
    handler = LdaHandler(chunked_words_bags)
    handler.create_dictionary()
    handler.create_corpus()

    # pop update_every so it is not passed to the model a second time via options
    update_every = options.pop('update_every', None) or 2
    handler.train_lda_model(options['numTopics'], update_every, options['numPasses'], options)
    topics = handler.lda_model.top_topics(handler.corpus, num_words=options['top_n'])

    # create output models
    topic_group = build_and_save_topic_tuples_and_topic_groups(topics, user, collection_data, 'lda', options)

    # relate collections to topic group
    add_collections_to_topic_group(topic_group, collection_data)

    # email upon completion
    try:
        send_document_done_email(user)
    except Exception as e:
        print(e)

    return topics
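# Example dispatch (illustrative only): this module shows plain functions, so
# the sketch assumes they are registered as Celery tasks (e.g. @shared_task)
# elsewhere. The option values below are made up; the keys are the ones this
# task actually reads.
#
#     options = {
#         'wordNetSense': False,   # passed through to CollectionParser
#         'chunking': 'count',     # one of 'count', 'breakword', 'none'
#         'chunk_size': 500,       # read only when chunking == 'count'
#         'breakword': 'CHAPTER',  # read only when chunking == 'breakword'
#         'numTopics': 10,
#         'numPasses': 5,
#         'update_every': 2,       # popped before training; defaults to 2
#         'top_n': 15,
#     }
#     topic_modeling_celery_task.delay(
#         [{'id': 1, 'filter': {}}], options, user_id)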
def hdp_celery_task(collection_data, options, user):
    """
    Async gensim HDP task.

    :param collection_data: list of dicts, each with 'id' and 'filter' keys
    :param options: modeling options (chunking, wordNetSense, ...)
    :param user: primary key of the requesting User
    :return: the trained model's topics
    """
    # get user from user id
    user = User.objects.get(pk=user)

    # get tokens from collection and filter them
    filtered_docs = []
    for item in collection_data:
        tokens = CollectionParser(item['id'], item['filter'],
                                  wordnet_status=options['wordNetSense']).get_bow()
        filtered_docs.append(tokens)

    chunked_words_bags = []

    # handle chunk by count case
    if options['chunking'] == 'count':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakword string
    elif options['chunking'] == 'breakword':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking
    elif options['chunking'] == 'none':
        chunked_words_bags = list(filtered_docs)

    # set up and execute gensim modeling
    handler = LdaHandler(chunked_words_bags)
    handler.create_dictionary()
    handler.create_corpus()
    handler.train_hdp_model(options)
    topics = handler.hdp_model.show_topics(topics=-1, log=False, formatted=False)

    # create output models
    topic_group = build_and_save_topic_tuples_and_topic_groups(topics, user, collection_data, 'hdp', options)

    # relate collections to topic group
    add_collections_to_topic_group(topic_group, collection_data)

    # email upon completion
    try:
        send_document_done_email(user)
    except Exception as e:
        print(e)

    return topics
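# All three tasks share the same chunking step. For illustration, a
# count-based chunker could look like the sketch below. This is a
# hypothetical stand-in for chunk_bag_of_word_collection_by_chunk_size
# (defined elsewhere in the project), underscored so it does not shadow
# the real helper; the actual implementation may differ.
def _chunk_by_count_sketch(bag, chunk_size):
    # split one bag of words into consecutive chunks of at most chunk_size tokens
    return [bag[i:i + chunk_size] for i in range(0, len(bag), chunk_size)]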
def lsi_celery_task(collection_data, options, user):
    """
    Async task to perform LSA.

    :param collection_data: list of dicts, each with 'id' and 'filter' keys
    :param options: modeling options (chunking, search_query, ...)
    :param user: primary key of the requesting User
    :return:
    """
    # get user from user id
    user = User.objects.get(pk=user)

    # get tokens from collections and filter them
    filtered_docs = []
    for item in collection_data:
        tokens = CollectionParser(item['id'], item['filter'],
                                  wordnet_status=options['wordNetSense']).get_bow()
        filtered_docs.append(tokens)

    chunked_words_bags = []

    # handle chunk by count case
    if options['chunking'] == 'count':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakword string
    elif options['chunking'] == 'breakword':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking
    elif options['chunking'] == 'none':
        chunked_words_bags = list(filtered_docs)

    # join each chunk back into a lowercased string for the vectorizer
    stringed_docs = []
    for doc in chunked_words_bags:
        stringed_docs.append(" ".join([x.lower() for x in doc]))

    # set up and execute the LSA pipeline: scikit-learn tf-idf, then truncated
    # SVD over the transposed term-document matrix (so rows are terms)
    try:
        transformer = TfidfVectorizer()
        tfidf = transformer.fit_transform(stringed_docs)

        # fall back to a single component when there are fewer than two documents
        num_components = 2 if len(stringed_docs) >= 2 else 1
        svd = TruncatedSVD(n_components=num_components)
        lsa = svd.fit_transform(tfidf.T)
        terms = kClosestTerms(15, options['search_query'], transformer, lsa)
    except Exception as e:
        print(e)
        terms = ["No results found for search"]

    # persist the result and relate it to the source collections
    result = LsiResult(
        user=user,
        results=json.dumps(terms),
        query_term=options['search_query'],
    )
    result.save()

    collections = [CorpusItemCollection.objects.get(pk=c.get('id')) for c in collection_data]
    for collection in collections:
        result.collections.add(collection)
    result.save()
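# kClosestTerms is a project helper not shown in this module. Given the shapes
# above (the SVD is fit on tfidf.T, so each row of `lsa` is a term vector),
# one plausible implementation is the cosine-similarity sketch below. It is an
# assumption about the helper's behavior, not the project's actual code, and
# is underscored so it does not shadow the real import.
import numpy as np

def _k_closest_terms_sketch(k, query_term, transformer, lsa):
    vocab = transformer.vocabulary_                 # term -> row index in lsa
    index_to_term = {i: t for t, i in vocab.items()}
    idx = vocab.get(query_term.lower())
    if idx is None:
        return []
    query_vec = lsa[idx]
    # cosine similarity of every term vector against the query vector
    norms = np.linalg.norm(lsa, axis=1) * np.linalg.norm(query_vec)
    sims = lsa.dot(query_vec) / np.where(norms == 0.0, 1.0, norms)
    ranked = np.argsort(sims)[::-1]
    return [index_to_term[i] for i in ranked if i != idx][:k]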