def topic_modeling_celery_task(collection_data, options, user, *args, **kwargs):
    """
    Async task to do gensim based topic modeling.

    :param collection_data: list of dicts, each with 'id' and 'filter' keys
    :param options: modeling options (chunking, numTopics, numPasses, ...)
    :param user: primary key of the requesting User
    :param args:
    :param kwargs:
    :return: the trained model's top topics
    """
    # get user from user id
    user = User.objects.get(pk=user)

    # get tokens from collection and parse with filters
    filtered_docs = []
    wordnet_status = options['wordNetSense']
    for item in collection_data:
        # override the collection filter's wordnet status. This should
        # probably live somewhere else in the future.
        tokens = CollectionParser(item['id'], item['filter'],
                                  wordnet_status=wordnet_status).get_bow()
        filtered_docs.append(tokens)

    chunked_words_bags = []

    # handle chunk by count case
    if options['chunking'] == 'count':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakword string
    elif options['chunking'] == 'breakword':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking
    elif options['chunking'] == 'none':
        chunked_words_bags = list(filtered_docs)

    # set up and execute gensim modeling
    handler = LdaHandler(chunked_words_bags)
    handler.create_dictionary()
    handler.create_corpus()

    # pop update_every so it is not passed to the model a second time via options
    update_every = options.pop('update_every', None) or 2
    handler.train_lda_model(options['numTopics'], update_every, options['numPasses'], options)
    topics = handler.lda_model.top_topics(handler.corpus, num_words=options['top_n'])

    # create output models
    topic_group = build_and_save_topic_tuples_and_topic_groups(topics, user, collection_data, 'lda', options)

    # relate collections to topic group
    add_collections_to_topic_group(topic_group, collection_data)

    # email upon completion
    try:
        send_document_done_email(user)
    except Exception as e:
        print(e)

    return topics
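# Example dispatch (illustrative only): this module shows plain functions, so
# the sketch assumes they are registered as Celery tasks (e.g. @shared_task)
# elsewhere. The option values below are made up; the keys are the ones this
# task actually reads.
#
#     options = {
#         'wordNetSense': False,   # passed through to CollectionParser
#         'chunking': 'count',     # one of 'count', 'breakword', 'none'
#         'chunk_size': 500,       # read only when chunking == 'count'
#         'breakword': 'CHAPTER',  # read only when chunking == 'breakword'
#         'numTopics': 10,
#         'numPasses': 5,
#         'update_every': 2,       # popped before training; defaults to 2
#         'top_n': 15,
#     }
#     topic_modeling_celery_task.delay(
#         [{'id': 1, 'filter': {}}], options, user_id)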
def hdp_celery_task(collection_data, options, user):
    """
    Async gensim HDP task.

    :param collection_data: list of dicts, each with 'id' and 'filter' keys
    :param options: modeling options (chunking, wordNetSense, ...)
    :param user: primary key of the requesting User
    :return: the trained model's topics
    """
    # get user from user id
    user = User.objects.get(pk=user)

    # get tokens from collection and filter them
    filtered_docs = []
    for item in collection_data:
        tokens = CollectionParser(item['id'], item['filter'],
                                  wordnet_status=options['wordNetSense']).get_bow()
        filtered_docs.append(tokens)

    chunked_words_bags = []

    # handle chunk by count case
    if options['chunking'] == 'count':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakword string
    elif options['chunking'] == 'breakword':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking
    elif options['chunking'] == 'none':
        chunked_words_bags = list(filtered_docs)

    # set up and execute gensim modeling
    handler = LdaHandler(chunked_words_bags)
    handler.create_dictionary()
    handler.create_corpus()
    handler.train_hdp_model(options)
    topics = handler.hdp_model.show_topics(topics=-1, log=False, formatted=False)

    # create output models
    topic_group = build_and_save_topic_tuples_and_topic_groups(topics, user, collection_data, 'hdp', options)

    # relate collections to topic group
    add_collections_to_topic_group(topic_group, collection_data)

    # email upon completion
    try:
        send_document_done_email(user)
    except Exception as e:
        print(e)

    return topics
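# All three tasks share the same chunking step. For illustration, a
# count-based chunker could look like the sketch below. This is a
# hypothetical stand-in for chunk_bag_of_word_collection_by_chunk_size
# (defined elsewhere in the project), underscored so it does not shadow
# the real helper; the actual implementation may differ.
def _chunk_by_count_sketch(bag, chunk_size):
    # split one bag of words into consecutive chunks of at most chunk_size tokens
    return [bag[i:i + chunk_size] for i in range(0, len(bag), chunk_size)]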
def lsi_celery_task(collection_data, options, user):
    """
    Async task to perform LSA.

    :param collection_data: list of dicts, each with 'id' and 'filter' keys
    :param options: modeling options (chunking, search_query, ...)
    :param user: primary key of the requesting User
    :return:
    """
    # get user from user id
    user = User.objects.get(pk=user)

    # get tokens from collections and filter them
    filtered_docs = []
    for item in collection_data:
        tokens = CollectionParser(item['id'], item['filter'],
                                  wordnet_status=options['wordNetSense']).get_bow()
        filtered_docs.append(tokens)

    chunked_words_bags = []

    # handle chunk by count case
    if options['chunking'] == 'count':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakword string
    elif options['chunking'] == 'breakword':
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking
    elif options['chunking'] == 'none':
        chunked_words_bags = list(filtered_docs)

    # join each chunk back into a lowercased string for the vectorizer
    stringed_docs = []
    for doc in chunked_words_bags:
        stringed_docs.append(" ".join([x.lower() for x in doc]))

    # set up and execute the LSA pipeline: scikit-learn tf-idf, then truncated
    # SVD over the transposed term-document matrix (so rows are terms)
    try:
        transformer = TfidfVectorizer()
        tfidf = transformer.fit_transform(stringed_docs)

        # fall back to a single component when there are fewer than two documents
        num_components = 2 if len(stringed_docs) >= 2 else 1
        svd = TruncatedSVD(n_components=num_components)
        lsa = svd.fit_transform(tfidf.T)
        terms = kClosestTerms(15, options['search_query'], transformer, lsa)
    except Exception as e:
        print(e)
        terms = ["No results found for search"]

    # persist the result and relate it to the source collections
    result = LsiResult(
        user=user,
        results=json.dumps(terms),
        query_term=options['search_query'],
    )
    result.save()

    collections = [CorpusItemCollection.objects.get(pk=c.get('id')) for c in collection_data]
    for collection in collections:
        result.collections.add(collection)
    result.save()
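# kClosestTerms is a project helper not shown in this module. Given the shapes
# above (the SVD is fit on tfidf.T, so each row of `lsa` is a term vector),
# one plausible implementation is the cosine-similarity sketch below. It is an
# assumption about the helper's behavior, not the project's actual code, and
# is underscored so it does not shadow the real import.
import numpy as np

def _k_closest_terms_sketch(k, query_term, transformer, lsa):
    vocab = transformer.vocabulary_                 # term -> row index in lsa
    index_to_term = {i: t for t, i in vocab.items()}
    idx = vocab.get(query_term.lower())
    if idx is None:
        return []
    query_vec = lsa[idx]
    # cosine similarity of every term vector against the query vector
    norms = np.linalg.norm(lsa, axis=1) * np.linalg.norm(query_vec)
    sims = lsa.dot(query_vec) / np.where(norms == 0.0, 1.0, norms)
    ranked = np.argsort(sims)[::-1]
    return [index_to_term[i] for i in ranked if i != idx][:k]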