def train_models_for_resources(data_type, resources, resource_lang_csv=None,
                               csv_data_file_path=None):
    resources_names_list = []
    if data_type == 'db':
        if resources is None:
            resources_all = Resources.select(Resources.resource).iterator()
            resources_names_list = [i.__data__['resource'] for i in resources_all]
        else:
            resources_names_list = [resources]
    elif data_type == 'csv':
        resources_names_list = [resources]

    if len(resources_names_list) == 0:
        raise Exception(
            "Resources not defined. Set the -r <resource> / --resource <resource> option.")

    for resource_name in resources_names_list:
        LDAMWHandler().train(data_type=data_type,
                             resource=resource_name,
                             res_lang=resource_lang_csv,
                             csv_data_file_path=csv_data_file_path)
    my_print("{}Train finished.\n".format(SUCCESS_FLAG))
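# --- Illustrative sketch (not part of the pipeline) -------------------------------
# train_models_for_resources() enumerates resource names via peewee's
# Resources.select(...).iterator() and the per-row __data__ dict. The snippet below
# reproduces only that access pattern with a hypothetical in-memory model
# (_DemoResource, _db and _list_resource_names are illustrative names, not project code).
from peewee import CharField, Model, SqliteDatabase

_db = SqliteDatabase(':memory:')


class _DemoResource(Model):
    resource = CharField()

    class Meta:
        database = _db


def _list_resource_names():
    _db.connect()
    _db.create_tables([_DemoResource])
    _DemoResource.create(resource='example.com')
    # Same shape as: [i.__data__['resource'] for i in Resources.select(...).iterator()]
    return [
        row.__data__['resource']
        for row in _DemoResource.select(_DemoResource.resource).iterator()
    ]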
def save_topics_to_csv(save_to_file_path, df):
    try:
        df.to_csv(save_to_file_path)
        my_print("{} Topics saved to [ {} ]".format(SUCCESS_FLAG, save_to_file_path))
    except Exception as e:
        my_print("{} {}".format(EXCEPTION_FLAG, e))
        my_print("{} Can't save topics to [ {} ]".format(ERROR_FLAG, save_to_file_path))
def get_subjectivity_analyzer(lang):
    try:
        sa_subj_data_file_path = 'nltk_data/sa_subjectivity.pickle'
        sentim_analyzer = load(DEFAULT_PROJECT_PATH + sa_subj_data_file_path)
    except LookupError:
        my_print('{}Cannot find the sentiment analyzer you want to load.'.format(
            WARNING_FLAG))
        my_print('{}Training & saving a new one using NaiveBayesClassifier.'.format(
            WARNING_FLAG))
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)
    return sentim_analyzer
def get_correlation_metric(resource, csv_data_file_path, data_type):
    resources_iterator = []
    if data_type == 'db':
        if resource is None:
            rdata = Resources.select().iterator()
            resources_iterator = [elem.__data__['resource'] for elem in rdata]
        else:
            if is_resource_exists(resource):
                resources_iterator = [resource]
            else:
                my_print("{}Resource [ {} ] not found. Exiting ...".format(
                    ERROR_FLAG, resource))
                return
    elif data_type == 'csv':
        resources_iterator = [resource]

    gstart_time = datetime.datetime.now()
    for _resource in resources_iterator:
        lstart_time = datetime.datetime.now()
        _get_correlation_metric_from_resource(_resource,
                                              csv_data_file_path=csv_data_file_path,
                                              data_type=data_type)
        my_print("{}Correlation for [ {} ] calculated in {}".format(
            INFO_FLAG, _resource, datetime.datetime.now() - lstart_time))
    my_print("{}Correlation for [ {} ] calculated in {}".format(
        INFO_FLAG, "All resources", datetime.datetime.now() - gstart_time))
def merge_topics_with_in_csv(input_file_path, save_to_file_path, df, on='id', how='outer'):
    try:
        df_in = pd.read_csv(input_file_path)
        if df_in.index.name != on:
            df_in.set_index([on], inplace=True)
        dfinal = df_in.merge(df, on=on, how=how)
        # Drop index-artefact "Unnamed: ..." columns left over from earlier to_csv round-trips.
        no_unnamed_columns = [i for i in dfinal.columns if "Unnamed" not in i]
        dfinal = dfinal[no_unnamed_columns]
        dfinal.to_csv(save_to_file_path)
        my_print("{} Topics saved to [ {} ]".format(SUCCESS_FLAG, save_to_file_path))
    except Exception as e:
        my_print("{} {}".format(EXCEPTION_FLAG, e))
        my_print("{} Can't save topics to [ {} ]".format(ERROR_FLAG, save_to_file_path))
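# --- Illustrative sketch (hypothetical data) ---------------------------------------
# merge_topics_with_in_csv() first aligns the input CSV's index to the shared 'id' key
# and then performs an outer merge, so rows that appear in either frame are kept. A
# minimal, self-contained reproduction of that merge (_merge_on_id_example and its
# column names are illustrative only):
import pandas as pd


def _merge_on_id_example():
    df_in = pd.DataFrame({'id': [1, 2], 'title': ['a', 'b']})
    df_topics = pd.DataFrame({'id': [2, 3], 'topic_0': [0.7, 0.1]})
    # Outer merge on the shared key: id 1 gets NaN topics, id 3 gets a NaN title.
    return df_in.merge(df_topics, on='id', how='outer')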
def run(corr_calc, base_calc, add_calc, resource, last_added_only, data_type,
        csv_data_input_file_path=None, csv_data_output_file_path=None):
    if data_type == 'db':
        articles = get_articles_from_db(resource_id=resource,
                                        last_added_only=last_added_only)
    elif data_type == 'csv':
        articles = get_articles_from_csv(resource=resource,
                                         csv_file_path=csv_data_input_file_path)
    else:
        raise Exception("Can't read data <articles>. Exiting ...")

    # Fan the article iterator out to the independent consumers below.
    (articles_simple, articles_additional, articles_for_data_params_simple,
     articles_for_data_params_additional) = tee(articles, 4)

    if base_calc:
        my_print("{}Going to calculate simple parameters for [ {} ] ...".format(
            INFO_FLAG, resource if resource is not None else "All resources"))
        time_start = datetime.now()

        if data_type == 'db':
            n_cores = multiprocessing.cpu_count()
            pool = multiprocessing.Pool(n_cores)
            extracted_simple_parameters = pool.imap(process_for_simple,
                                                    articles_for_data_params_simple)
            for art, ltc_params in zip(articles_simple, extracted_simple_parameters):
                for param in ltc_params:
                    if param is not None:
                        save_parameters(art, param)
            # Release the workers only after the imap results have been consumed.
            pool.close()
            pool.join()
        elif data_type == 'csv':
            n_cores = multiprocessing.cpu_count()
            pool = multiprocessing.Pool(n_cores)
            extracted_simple_parameters = pool.map(process_for_simple,
                                                   articles_for_data_params_simple)

            if csv_data_output_file_path is not None:
                output_file = csv_data_output_file_path
            else:
                output_file = "{}_simple_parameters.csv".format(resource)

            csv_resource_simple_parameters_df = pd.DataFrame()
            # Reuse the same pool for the second map instead of leaking a new Pool().
            a_tmps = pool.map(_process_csv_pool,
                              zip(articles_simple, extracted_simple_parameters))
            for tmp in a_tmps:
                csv_resource_simple_parameters_df = \
                    csv_resource_simple_parameters_df.append([tmp])
            pool.close()
            pool.join()

            csv_resource_simple_parameters_df.set_index(['id'], inplace=True)
            merge_parameters_with_in_csv(csv_data_input_file_path, output_file,
                                         csv_resource_simple_parameters_df)

        my_print("{}Resources: [ {} ]; Simple parameters calculated in {}".format(
            SUCCESS_FLAG,
            resource if resource is not None else "All resources",
            datetime.now() - time_start))

    if corr_calc:
        my_print(
            "{}Going to calculate articles parameters correlation for [ {} ] ...".format(
                INFO_FLAG, resource if resource is not None else "All resources"))
        time_start = datetime.now()
        get_correlation_metric(resource,
                               csv_data_file_path=csv_data_input_file_path,
                               data_type=data_type)
        my_print("{}Resources: [ {} ]; Correlation calculated in {}".format(
            SUCCESS_FLAG,
            resource if resource is not None else "All resources",
            datetime.now() - time_start))

    if add_calc:
        my_print("{}Going to calculate additional parameters for [ {} ] ...".format(
            INFO_FLAG, resource if resource is not None else "All resources"))
        time_start = datetime.now()

        if data_type == 'db':
            # The pool from the base_calc step (if any) is already closed; create a new one.
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            extracted_additional_parameters = pool.imap(
                process_for_additional, articles_for_data_params_additional)
            for art, params in zip(articles_additional,
                                   extracted_additional_parameters):
                if params is not None:
                    save_additional_parameters(params, art)
            pool.close()
            pool.join()
        elif data_type == 'csv':
            extracted_additional_parameters = (
                process_for_additional(i)
                for i in articles_for_data_params_additional)

            if csv_data_output_file_path is not None:
                output_file = csv_data_output_file_path
            else:
                output_file = "{}_additional_parameters.csv".format(resource)

            # Columns are taken from the parameter dicts returned by process_for_additional
            # (title/lead/content counts, emotion scores, readability and
            # lexical-diversity metrics, etc.).
            csv_resource_additional_parameters_df = pd.DataFrame()
            for art, params in zip(articles_additional,
                                   extracted_additional_parameters):
                a_id = art[-1]
                tmp = {'id': a_id, **params}
                csv_resource_additional_parameters_df = \
                    csv_resource_additional_parameters_df.append([tmp])

            csv_resource_additional_parameters_df.set_index('id', inplace=True)
            merge_parameters_with_in_csv(csv_data_input_file_path, output_file,
                                         csv_resource_additional_parameters_df)

        my_print("{}Additional parameters for {} calculated in {}".format(
            SUCCESS_FLAG,
            resource if resource is not None else "All resources",
            datetime.now() - time_start))
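# --- Illustrative sketch (not part of the pipeline) ---------------------------------
# run() above fans one article iterator out with itertools.tee so that one copy feeds
# the worker pool and another is zipped against the results. A minimal, self-contained
# version of that pattern (_square and _tee_map_example are illustrative names only;
# the alias _Pool avoids shadowing the Pool name used elsewhere in this module):
from itertools import tee
from multiprocessing import Pool as _Pool


def _square(x):
    return x * x


def _tee_map_example(values):
    originals, for_workers = tee(values, 2)
    with _Pool(2) as pool:
        results = pool.map(_square, for_workers)
    # Inputs stay aligned with their results because both views came from the same tee.
    return list(zip(originals, results))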
def __call__(self, text):
    try:
        words = nltk.tokenize.word_tokenize(text)
        # Look up each word's valence once, then split into positive and negative sets.
        valences = [self.analyzer.lexicon.get(word.lower(), 0) for word in words]
        pos_valuences = [v for v in valences if v > 0]
        neg_valuences = [v for v in valences if v < 0]
        nwords, npos, nneg = len(words), len(pos_valuences), len(neg_valuences)
        polarity = self.analyzer.polarity_scores(text)
        return (
            ('global_negative_polarity', polarity['neg']),
            ('global_positive_polarity', polarity['pos']),
            ('global_neutral_polarity', polarity['neu']),
            ('global_sentiment_polarity', polarity['compound']),
            ('global_rate_positive_words',
             self._sigmoid(npos / nwords) if nwords else self._sigmoid(npos)),
            ('global_rate_negative_words',
             self._sigmoid(nneg / nwords) if nwords else self._sigmoid(nneg)),
            ('rate_positive_words',
             self._sigmoid(npos / (npos + nneg)) if npos or nneg else self._sigmoid(npos)),
            ('rate_negative_words',
             self._sigmoid(nneg / (npos + nneg)) if npos or nneg else self._sigmoid(nneg)),
            ('avg_positive_polarity',
             self._sigmoid(sum(pos_valuences) / npos) if npos else self._sigmoid(sum(pos_valuences))),
            ('min_positive_polarity',
             self._sigmoid(min(pos_valuences)) if pos_valuences else 0.0),
            ('max_positive_polarity',
             self._sigmoid(max(pos_valuences)) if pos_valuences else 0.0),
            ('avg_negative_polarity',
             self._sigmoid(sum(neg_valuences) / nneg) if nneg else self._sigmoid(sum(neg_valuences))),
            ('min_negative_polarity',
             self._sigmoid(min(neg_valuences)) if neg_valuences else 0.0),
            ('max_negative_polarity',
             self._sigmoid(max(neg_valuences)) if neg_valuences else 0.0),
        )
    except Exception as e:
        my_print("{}{}".format(EXCEPTION_FLAG, e))
        return (
            ('global_negative_polarity', 0),
            ('global_positive_polarity', 0),
            ('global_neutral_polarity', 0),
            ('global_sentiment_polarity', 0),
            ('global_rate_positive_words', 0),
            ('global_rate_negative_words', 0),
            ('rate_positive_words', 0),
            ('rate_negative_words', 0),
            ('avg_positive_polarity', 0),
            ('min_positive_polarity', 0),
            ('max_positive_polarity', 0),
            ('avg_negative_polarity', 0),
            ('min_negative_polarity', 0),
            ('max_negative_polarity', 0),
        )
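# --- Illustrative sketch (assumption: self.analyzer is a VADER-style analyzer) ------
# The __call__ above reads per-word valences from self.analyzer.lexicon and overall
# scores from self.analyzer.polarity_scores. The same idea with NLTK's
# SentimentIntensityAnalyzer directly, reduced to two of the returned features
# (requires the 'vader_lexicon' and 'punkt' NLTK resources; _vader_rate_features is
# an illustrative name, not project code):
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def _vader_rate_features(text):
    analyzer = SentimentIntensityAnalyzer()
    words = nltk.tokenize.word_tokenize(text)
    valences = [analyzer.lexicon.get(w.lower(), 0) for w in words]
    npos = sum(1 for v in valences if v > 0)
    nneg = sum(1 for v in valences if v < 0)
    scores = analyzer.polarity_scores(text)
    return {
        'global_sentiment_polarity': scores['compound'],
        'rate_positive_words': npos / (npos + nneg) if (npos + nneg) else 0.0,
    }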
def run(resource=None, period=None, last_added_only=False, data_type=None,
        csv_data_input_file_path=None, csv_data_output_file_path=None):
    gtime_start = datetime.datetime.now()
    if data_type == 'db':
        if resource is None:
            rdata = Resources.select().iterator()
            resources_iterator = [elem.__data__['resource'] for elem in rdata]
        else:
            if is_resource_exists(resource):
                resources_iterator = [resource]
            else:
                my_print("{}Resource [ {} ] not found. Exiting ...".format(
                    ERROR_FLAG, resource))
                return

        ps_resources = (get_articles_from_db(resource_id=r_id,
                                             period=period,
                                             last_added_only=last_added_only)
                        for r_id in resources_iterator)
        for ps, res in zip(ps_resources, resources_iterator):
            ltime_start = datetime.datetime.now()
            ps, articles_s, data = tee(ps, 3)
            # Check content; process only articles that are neither empty nor too short.
            data = ((p.lead, p.title, p.content, res) for p in data
                    if is_text(p, None)[0])
            pool = Pool()
            params = pool.map(topic_ltc_by_resource, data)
            for art, prms in zip(articles_s, params):
                for par in prms:
                    if par is not None:
                        save_parameters(par, art)
            my_print("{} [ {} ] :: LDA topics calculated in {}".format(
                SUCCESS_FLAG, res, datetime.datetime.now() - ltime_start))
            del pool
        if len(resources_iterator) != 1:
            my_print("{}{} :: calculated in {}".format(
                SUCCESS_FLAG, " ".join(resources_iterator),
                datetime.datetime.now() - gtime_start))
    elif data_type == 'csv':
        if resource is None:
            raise Exception("Resource can't be undefined for csv data_type.")
        ps_csv_resource = get_articles_from_csv(resource, csv_data_input_file_path)
        ltime_start = datetime.datetime.now()
        ps, articles_s, data = tee(ps_csv_resource, 3)

        if csv_data_output_file_path is not None:
            output_file = csv_data_output_file_path
        else:
            output_file = "{}_topics.csv".format(resource)

        csv_resource_topics_df = pd.DataFrame()
        pool = Pool()
        a_tmps = pool.map(_process_csv_pool, data)
        for tmp in a_tmps:
            csv_resource_topics_df = csv_resource_topics_df.append([tmp])
        del pool
        csv_resource_topics_df.set_index('id', inplace=True)
        my_print("{} [ {} ] :: LDA topics calculated in {}".format(
            SUCCESS_FLAG, resource, datetime.datetime.now() - ltime_start))
        # save_topics_to_csv(output_file, csv_resource_topics_df)
        merge_topics_with_in_csv(csv_data_input_file_path, output_file,
                                 csv_resource_topics_df)
    else:
        pass