def deserialize(self, type, name, language='en'):
    serializer = self.serializers[type]
    if type != "lda_model":
        with codecs.open(name, "r", encoding="utf-8") as f:
            data = json.load(f)
    elif type == "lda_model":
        with open(name, "rb") as f:
            data = pickle.load(f)
    deserialized = serializer(data).deserialize()
    if type == "phrases":
        if language == 'en':
            common_terms = self.function_words_single
        else:
            common_terms = safe_get_stop_words(language)
        phrases = Phrases(delimiter="_", connector_words=common_terms)
        phrases.phrasegrams = deserialized
        deserialized = phrases
    return deserialized
def remove_stop_words(string, language):
    tokens = string.split()
    clean_tokens = [
        token for token in tokens
        if token not in safe_get_stop_words(language)
    ]
    return u' '.join(clean_tokens)
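# Illustrative usage of remove_stop_words (an added sketch, not part of the
# original source). It assumes the stop_words package is installed; "this",
# "is" and "a" appear in its English list, so they are stripped. Because the
# comprehension above re-fetches the stop-word list for every token, callers
# processing many strings may want to hoist safe_get_stop_words(language)
# into a set once beforehand.
cleaned = remove_stop_words(u'this is a small example sentence', 'en')
print(cleaned)  # expected roughly: u'small example sentence'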
def get_document_json(url):
    """
    Parameters
    -------------
    url: str
        url of the document to be parsed.

    Returns
    -------------
    dict: document data.
    """
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    if article.publish_date is None or isinstance(article.publish_date, str):
        date = None
    else:
        date = article.publish_date.strftime('%Y-%m-%d')
    if article.meta_lang is not None and article.meta_lang != '':
        stopwords = safe_get_stop_words(article.meta_lang)
        keywords = [i for i in article.keywords if i not in stopwords]
    else:
        keywords = article.keywords
    keywords = list(set([slugify(i) for i in keywords]))
    json = {
        'title': article.title,
        'authors': article.authors,
        'created_on': date,
        'language': article.meta_lang,
        'keywords': keywords,
        'url': url,
    }
    return json
def make_word_cloud(df, ngram_min, ngram_max, name):
    my_path = os.path.abspath(os.path.dirname(__file__)) + '/static/wordcloud/'
    stop_words = safe_get_stop_words('en')
    filenames = []
    for ind, row in df.iterrows():
        data = row['review_text']
        num_words = 200
        ngram_range = (ngram_min, ngram_max)
        count_vectorizer = CountVectorizer(lowercase=True,
                                           stop_words=stop_words,
                                           ngram_range=ngram_range)
        counts = count_vectorizer.fit_transform(data)
        counts = counts.toarray().sum(axis=0)
        count_weighting = dict(
            zip(count_vectorizer.get_feature_names(), counts))
        count_weighting_df = pd.DataFrame.from_dict(count_weighting,
                                                    orient='index')
        count_weighting_df = count_weighting_df.reset_index(drop=False)
        count_weighting_df.columns = ['word', 'count']
        count_weighting_df = count_weighting_df.sort_values(['count'],
                                                            ascending=False)
        count_weighting_df = count_weighting_df.set_index('word')
        word_cloud_freq = count_weighting_df['count'].head(num_words).to_dict()
        wordcloud = WordCloud(
            collocations=False).generate_from_frequencies(word_cloud_freq)
        plotname = '{}_{}.png'.format(name, ind + 1)
        filenames.append(plotname)
        url = my_path + plotname
        fig = plt.figure(figsize=(10, 10))
        plt.imshow(wordcloud, cmap=plt.cm.bone, interpolation='bilinear')
        plt.axis("off")
        fig.savefig(url, transparent=True, bbox_inches='tight', pad_inches=0)
    return filenames
def test_random_language_stop_words_load(self):
    languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES)
    sample = random.sample(languages, len(languages))
    for language in sample:
        stop_words = safe_get_stop_words(language)
        self.assertTrue(
            len(stop_words) > 0,
            'Cannot load stopwords for {0} language'.format(language))
def suitable_complex_word(w):
    """Checks if detected word is suitable for replacing."""
    # Not stopword or punctuation.
    not_stopword = w not in safe_get_stop_words(config.lang) and w.isalpha()
    # Not a simple word (above defined threshold).
    not_simple = zfreq(w, config.lang) < config.min_complexity
    # No uppercase (ensures NEs are not simplified).
    not_uppercase = w.islower()
    return not_stopword and not_simple and not_uppercase
def get_stopwords_by_lang(language):
    while True:
        try:
            stopwords = set(nltk.corpus.stopwords.words(language))
            return stopwords
        except LookupError as error:
            resource = re.search("nltk\\.download\\('(.+?)'\\)", str(error)).group(1)
            print(f'Downloading missing resource [{resource}]')
            nltk.download(resource)
        except IOError as error:
            stopwords = stop_words.safe_get_stop_words(language)
            return stopwords
def assemble_stopwords(languages=['english'], user_defined=[]):
    '''
    Supported Languages

    Arabic, Catalan, Danish, Dutch, English, Finnish, French, German,
    Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish,
    Swedish, Turkish, Ukrainian
    '''
    sw = []
    if len(user_defined) > 0:
        sw += user_defined
    for i in languages:
        sw += safe_get_stop_words(i)
    return set(sw)
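# Illustrative usage of assemble_stopwords (an added sketch, not part of the
# original source). The stop_words package accepts full language names such as
# 'english' and 'german'; user-defined tokens are merged in and duplicates
# collapse because the result is a set.
combined = assemble_stopwords(languages=['english', 'german'],
                              user_defined=['foo', 'bar'])
print('foo' in combined, len(combined))  # True, size depends on the package version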
def remove_stopwords(text, text_language):
    tweet_words = text.lower().split()
    stop_words = safe_get_stop_words(
        languages.get(part1=text_language).name.lower()) + [
            "ul", "b", "v", "a", "z", "li", "o", "s", "k", "i", "se", "u003e",
            "u003c", "u", "href", "u003cli", "u003ca", "u003cul", "u003cb",
            "httpslinkcomdate", "u003edalší", "n"
        ]
    words = ""
    for word in tweet_words:
        if word not in stop_words:
            words = words + (" " + word)
    return words
def _build_phrases(self, df, min_count=1, language='en'):
    if language == 'en':
        common_terms = self.function_words_single
    else:
        common_terms = safe_get_stop_words(language)
    phrases = Phrases(
        sentences=stream_clean(df),
        min_count=min_count,
        threshold=0.70,
        scoring="npmi",
        max_vocab_size=20000000,
        delimiter="_",
        connector_words=common_terms
    )
    self.phrases = phrases
def extract_dictionary_feature(self, check_str):
    """
    TODO
    * check that the string is actually a word (may not be necessary
      with different tokenisation)

    Example:
    >>> tests = [u'Hom.', u'Homér']
    >>> fe = FeatureExtractor()
    >>> [(tests[n], fe.feat_labels[fe.extract_dictionary_feature(t)[1]]) for n, t in enumerate(tests)]
    """
    feature_name = "n_works_dictionary"
    # compile a list of stopwords for all relevant languages
    languages = ["it", "de", "fr", "en", "es"]
    stopwords = []
    for lang in languages:
        stopwords += safe_get_stop_words(lang)
    if len(check_str) <= 2 or check_str.lower() in stopwords:
        # don't output dictionary feature for stopwords!
        return (feature_name, self.OTHERS)
    match_works = self.works_dict.lookup(check_str.encode("utf-8"))
    match_authors = self.authors_dict.lookup(check_str.encode("utf-8"))
    # result = (feature_name, self.OTHERS)
    if len(match_authors) > 0:
        for key in match_authors:
            if len(match_authors[key]) == len(check_str):
                result = (feature_name, self.MATCH_AUTHORS_DICT)
            else:
                result = (feature_name, self.CONTAINED_AUTHORS_DICT)
    elif len(match_works) > 0:
        for key in match_works:
            if len(match_works[key]) == len(check_str):
                result = (feature_name, self.MATCH_WORKS_DICT)
            else:
                result = (feature_name, self.CONTAINED_WORKS_DICT)
    else:
        result = (feature_name, self.OTHERS)
    return result
def suitable_candidate(w, c):
    """Checks if candidate is a suitable substitute based on various criteria."""
    source_stem = stemmer.stem(w)
    candidate_stem = stemmer.stem(c)
    # Check stem length.
    not_stem_len = not (len(candidate_stem) >= 3 and candidate_stem[:3] == source_stem[:3])
    # Not sharing stem with original word.
    not_equal_stem = source_stem != candidate_stem
    # Not punctuation.
    not_punctuation = c.isalpha()
    # Other checks (disable when benchmarking).
    not_morph_deriv = c not in w and w not in c
    not_complex = zfreq(c, config.lang) > zfreq(w, config.lang)
    not_stopword = c not in safe_get_stop_words(config.lang) and c.isalpha()
    return not_equal_stem and not_stem_len and not_morph_deriv and not_stopword and not_complex
def get_document_json(post):
    """
    Parameters
    -------------
    post: dict
        post data.

    Returns
    -------------
    dict: document data.
    """
    try:
        article = Article(post['url'])
        article.download()
        article.parse()
        article.nlp()
        if article.publish_date is None or isinstance(article.publish_date, str):
            date = None
        else:
            date = article.publish_date.strftime('%Y-%m-%d')
        if article.meta_lang is not None and article.meta_lang != '':
            stopwords = safe_get_stop_words(article.meta_lang)
            keywords = [i for i in article.keywords if i not in stopwords]
        else:
            keywords = article.keywords
        keywords = list(set([slugify(i) for i in keywords]))
        json = {
            'title': article.title,
            'authors': article.authors,
            'created_on': date,
            'language': article.meta_lang,
            'keywords': keywords,
            'url': post['url'],
        }
        if article.has_top_image() and post['image'] == MISSING_IMAGE:
            post['image'] = article.top_image
    except ArticleException:
        json = {
            'url': post['url']
        }
    return json
def get_association(self, df, min_count=1, threshold=0.70, save_phraser=False, language='en'):
    cxg = C2xG(language=self.settings.MAP_THREE[language])
    association_df = cxg.get_association(self.read(df),
                                         freq_threshold=min_count,
                                         smoothing=False,
                                         lex_only=True)
    if save_phraser == True:
        if language == 'en':
            common_terms = self.function_words_single
        else:
            common_terms = safe_get_stop_words(language)
        phrasegrams = {}
        for row in association_df.itertuples():
            word = row[1] + "_" + row[2]
            if row[3] > threshold:
                phrasegrams[word] = row[3]
        phrases = Phrases(delimiter="_", connector_words=common_terms,
                          min_count=min_count, threshold=threshold)
        phrases.phrasegrams = phrasegrams
        self.phrases = phrases
    return association_df
def senti_values_csv(min_words):
    twitter_data = import_csv("ItalianTweets.csv")
    tweet_list = get_column(twitter_data, 6)
    tweet_sentiment = get_column(twitter_data, 2)
    size = len(tweet_list)
    for tweet in range(0, size):
        tweet_list[tweet] = del_twitter_words(tweet_list[tweet])
        tweet_list[tweet] = del_characters(tweet_list[tweet],
                                           '"!@#$%^&*()_-+=1234567890?<>|[]{}\/')
    # senti_dict = {}
    stopwords = stop_words.safe_get_stop_words('it')
    senti_words = create_list(tweet_list)
    senti_words = remove_duplicates(stopwords, senti_words, min_words)
    # print(len(senti_words))
    senti_words = translations(senti_words)
    senti_list = get_senti_values(senti_words)
    with open('Senti_Values.csv', 'wt') as senti_file:
        wr = csv.writer(senti_file, lineterminator='\n', quoting=csv.QUOTE_ALL)
        wr.writerow(['Word', 'Pos', 'Neg'])
        length = len(senti_list)
        for pos in range(0, length):
            wr.writerow(senti_list[pos])
def test_safe_get_stop_words(self):
    self.assertRaises(StopWordError, get_stop_words, 'huttese')
    self.assertEqual(safe_get_stop_words('huttese'), [])
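# Added note (not part of the original source): the test above captures the
# contract relied on throughout these snippets. get_stop_words() raises
# StopWordError for an unknown language, while safe_get_stop_words() swallows
# the error and returns an empty list, so callers can concatenate or iterate
# over the result without guarding.
from stop_words import get_stop_words, safe_get_stop_words

print(len(safe_get_stop_words('en')) > 0)   # True for supported languages
print(safe_get_stop_words('huttese'))       # [] instead of an exception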
def main(pieces, strength, min_words):
    twitter_data = import_csv("ItalianTweets.csv")
    tweet_list = get_column(twitter_data, 6)
    tweet_sentiment = get_column(twitter_data, 2)
    size = len(tweet_list)
    for tweet in range(0, size):
        tweet_list[tweet] = del_twitter_words(tweet_list[tweet])
        tweet_list[tweet] = del_characters(tweet_list[tweet],
                                           '"!@#$%^&*()_-+=1234567890?<>|[]{}\/')
    # senti_dict = {}
    stopwords = stop_words.safe_get_stop_words('it')
    senti_words = create_list(tweet_list)
    senti_words = remove_duplicates(stopwords, senti_words, min_words)
    senti_words = translations(senti_words)
    senti_dict = get_senti_values(senti_words)
    # pieces = 5
    train_and_test_data = randomize_tweets(tweet_list, tweet_sentiment, pieces)
    # total_data = [(1*strength) for x in range(width)]
    results = open('results.txt', 'w')
    results.write('Pieces = ' + str(pieces) + '\n')
    results.write('Min Words = ' + str(min_words) + '\n\n\n')
    results.close()
    for value in range(5, strength + 1):
        final_train_results = [[] for x in range(5)]
        final_nb_results = [[] for x in range(5)]
        final_senti_results = [[] for x in range(5)]
        for piece in range(0, pieces):
            print(piece)
            train_data = []
            train_labels = []
            test_data = train_and_test_data[piece * 2]
            test_labels = train_and_test_data[piece * 2 + 1]
            for y in range(0, pieces):
                if piece != y:
                    train_data.extend(train_and_test_data[y * 2])
                    train_labels.extend(train_and_test_data[y * 2 + 1])
            train_words = create_list(train_data)
            train_words = remove_duplicates(stopwords, train_words, min_words)
            results = run_naive_bayes(train_data, train_labels, test_data,
                                      test_labels, train_words, stopwords,
                                      senti_dict, value)
            for x in range(0, 5):
                final_train_results[x].append(results[0][x])
                final_nb_results[x].append(results[1][x])
                final_senti_results[x].append(results[2][x])
        accuracy = [0, 0, 0, 0]
        positive_precision = [0, 0, 0, 0]
        negative_precision = [0, 0, 0, 0]
        positive_recall = [0, 0, 0, 0]
        negative_recall = [0, 0, 0, 0]
        for x in range(0, pieces):
            accuracy[0] += final_nb_results[0][x]
            accuracy[2] += final_senti_results[0][x]
            positive_precision[0] += final_nb_results[1][x]
            positive_precision[2] += final_senti_results[1][x]
            negative_precision[0] += final_nb_results[2][x]
            negative_precision[2] += final_senti_results[2][x]
            positive_recall[0] += final_nb_results[3][x]
            positive_recall[2] += final_senti_results[3][x]
            negative_recall[0] += final_nb_results[4][x]
            negative_recall[2] += final_senti_results[4][x]
        accuracy[0] /= float(pieces)
        accuracy[2] /= float(pieces)
        positive_precision[0] /= float(pieces)
        positive_precision[2] /= float(pieces)
        negative_precision[0] /= float(pieces)
        negative_precision[2] /= float(pieces)
        positive_recall[0] /= float(pieces)
        positive_recall[2] /= float(pieces)
        negative_recall[0] /= float(pieces)
        negative_recall[2] /= float(pieces)
        for y in range(0, pieces):
            accuracy[1] += (accuracy[0] - final_nb_results[0][y]) ** 2
            accuracy[3] += (accuracy[2] - final_senti_results[0][y]) ** 2
            positive_precision[1] += (positive_precision[0] - final_nb_results[1][y]) ** 2
            positive_precision[3] += (positive_precision[2] - final_senti_results[1][y]) ** 2
            negative_precision[1] += (negative_precision[0] - final_nb_results[2][y]) ** 2
            negative_precision[3] += (negative_precision[2] - final_senti_results[2][y]) ** 2
            positive_recall[1] += (positive_recall[0] - final_nb_results[3][y]) ** 2
            positive_recall[3] += (positive_recall[2] - final_senti_results[3][y]) ** 2
            negative_recall[1] += (negative_recall[0] - final_nb_results[4][y]) ** 2
            negative_recall[3] += (negative_recall[2] - final_senti_results[4][y]) ** 2
        accuracy[1] = (accuracy[1] / float(pieces)) ** 0.5
        accuracy[3] = (accuracy[3] / float(pieces)) ** 0.5
        positive_precision[1] = (positive_precision[1] / float(pieces)) ** 0.5
        positive_precision[3] = (positive_precision[3] / float(pieces)) ** 0.5
        negative_precision[1] = (negative_precision[1] / float(pieces)) ** 0.5
        negative_precision[3] = (negative_precision[3] / float(pieces)) ** 0.5
        positive_recall[1] = (positive_recall[1] / float(pieces)) ** 0.5
        positive_recall[3] = (positive_recall[3] / float(pieces)) ** 0.5
        negative_recall[1] = (negative_recall[1] / float(pieces)) ** 0.5
        negative_recall[3] = (negative_recall[3] / float(pieces)) ** 0.5
        # final_train_results[x] = float(final_train_results[x])/pieces
        # final_nb_results[x] = float(final_nb_results[x])/pieces
        # final_senti_results[x] = float(final_senti_results[x])/pieces
        # print(final_train_results[x])
        # print(final_nb_results[x])
        # print(final_senti_results[x])
        # print()
        results = open('results.txt', 'a')
        results.write('Strength = ' + str(value) + '\n\n')
        # results.write('Training Data' + '\n')
        # results.write('Accuracy: ' + str(final_train_results[0]) + '\n')
        # results.write('Positive Precision: ' + str(final_train_results[1]) + '\n')
        # results.write('Negative Precision: ' + str(final_train_results[2]) + '\n')
        # results.write('Positive Recall ' + str(final_train_results[3]) + '\n')
        # results.write('Negative Recall ' + str(final_train_results[4]) + '\n')
        # results.write('\n')
        results.write('Naive Bayes' + '\n')
        results.write('Accuracy: ' + str(accuracy[0]) + ' ' + str(accuracy[1]) + '\n')
        results.write('Positive Precision: ' + str(positive_precision[0]) + ' ' + str(positive_precision[1]) + '\n')
        results.write('Negative Precision: ' + str(negative_precision[0]) + ' ' + str(negative_precision[1]) + '\n')
        results.write('Positive Recall ' + str(positive_recall[0]) + ' ' + str(positive_recall[1]) + '\n')
        results.write('Negative Recall ' + str(negative_recall[0]) + ' ' + str(negative_recall[1]) + '\n')
        results.write('\n')
        results.write('SentiWordNet' + '\n')
        results.write('Accuracy: ' + str(accuracy[2]) + ' ' + str(accuracy[3]) + '\n')
        results.write('Positive Precision: ' + str(positive_precision[2]) + ' ' + str(positive_precision[3]) + '\n')
        results.write('Negative Precision: ' + str(negative_precision[2]) + ' ' + str(negative_precision[3]) + '\n')
        results.write('Positive Recall: ' + str(positive_recall[2]) + ' ' + str(positive_recall[3]) + '\n')
        results.write('Negative Recall: ' + str(negative_recall[2]) + ' ' + str(negative_recall[3]) + '\n')
        results.write('\n\n')
        results.close()
json_paths = []
if arguments.config:
    with open(arguments.config, 'r', encoding='utf-8') as f:
        line = f.readline()
        while line:
            json_paths.append(line.replace('\n', ''))
            line = f.readline()
if arguments.json_paths:
    for x in arguments.json_paths:
        json_paths.append(x)
print(json_paths)

p = arguments.print
generate_wordcloud = arguments.word_cloud
if generate_wordcloud:
    from stop_words import safe_get_stop_words
    stop_words = safe_get_stop_words('de')
    for word in custom_stop_words:
        stop_words.append(word)
    print('Stop words:')
    print(stop_words)
    if arguments.image:
        import numpy as np
        from PIL import Image
        wordcloud_mask = np.array(Image.open(arguments.image))
        use_mask = True
    else:
        wordcloud_mask = None
        use_mask = False
wordcloud_users = arguments.word_cloud_users
if arguments.starting_time is not None:
def _compute_tfidf_matrix(self, base_dir=None):
    LOGGER.info('Computing TF-IDF matrix (base_dir={})'.format(base_dir))
    tfidf_data = {}

    # Compute tf-idf distribution for each language
    for lang in LANGUAGES:
        lang_data = {}

        if not base_dir:
            resources_dir = 'data/wikipages/text/authors/{}'.format(lang)
            text_authors_dir_lang = pkg_resources.resource_filename(
                'citation_extractor',
                resources_dir
            )
            text_authors_files = pkg_resources.resource_listdir(
                'citation_extractor',
                resources_dir
            )
        else:
            text_authors_dir_lang = os.path.join(base_dir, lang)
            text_authors_files = os.listdir(text_authors_dir_lang)

        LOGGER.info('Computing TF-IDF matrix: using %i documents for '
                    'language %s' % (len(text_authors_files), lang))

        texts = []
        urn_to_index = {}
        index = 0
        for file in text_authors_files:
            if not file.endswith('.txt'):
                continue
            urn = file.replace('.txt', '')
            filepath = os.path.join(text_authors_dir_lang, file)
            with open(filepath) as txt_file:
                text = txt_file.read()
            texts.append(text)
            urn_to_index[urn] = index
            index += 1

        # Dictionary mapping a URN to an index (row)
        lang_data['urn_to_index'] = urn_to_index

        tfidf_vectorizer = TfidfVectorizer(
            input='content',
            strip_accents='unicode',
            analyzer='word',
            stop_words=safe_get_stop_words(lang)
        )

        # Language-specific vectorizer
        lang_data['vectorizer'] = tfidf_vectorizer

        # Tf-idf matrix computed with the specific vectorizer
        tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
        lang_data['matrix'] = tfidf_matrix

        tfidf_data[lang] = lang_data

    LOGGER.info('Done computing TF-IDF matrix.')
    return tfidf_data
def get_results(self, segment=None):
    key = PollQuestion.POLL_QUESTION_RESULTS_CACHE_KEY % (self.poll.org.pk, self.poll.pk, self.pk)
    if segment:
        substituted_segment = self.poll.org.substitute_segment(segment)
        key += ":" + slugify(unicode(json.dumps(substituted_segment)))

    cached_value = cache.get(key, None)
    if cached_value:
        return cached_value["results"]

    org = self.poll.org
    open_ended = self.is_open_ended()
    responded = self.get_responded()
    polled = self.get_polled()

    results = []

    if open_ended and not segment:
        cursor = connection.cursor()

        custom_sql = """
        SELECT w.label, count(*) AS count FROM
            (SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label
             FROM polls_pollresult
             WHERE polls_pollresult.org_id = %d
               AND polls_pollresult.flow = '%s'
               AND polls_pollresult.ruleset = '%s') w
        group by w.label order by count desc;
        """ % (org.id, self.poll.flow_uuid, self.ruleset_uuid)
        cursor.execute(custom_sql)

        from ureport.utils import get_dict_from_cursor
        unclean_categories = get_dict_from_cursor(cursor)
        categories = []

        ureport_languages = getattr(settings, 'LANGUAGES', [('en', 'English')])

        org_languages = [lang[1].lower() for lang in ureport_languages if lang[0] == org.language]
        if 'english' not in org_languages:
            org_languages.append('english')

        ignore_words = []
        for lang in org_languages:
            ignore_words += safe_get_stop_words(lang)

        categories = []

        for category in unclean_categories:
            if len(category['label']) > 1 and category['label'] not in ignore_words and len(categories) < 100:
                categories.append(dict(label=category['label'], count=int(category['count'])))

        # sort by count, then alphabetically
        categories = sorted(categories, key=lambda c: (-c['count'], c['label']))

        results.append(dict(open_ended=open_ended, set=responded, unset=polled - responded, categories=categories))

    else:
        categories_label = self.response_categories.filter(is_active=True).values_list('category', flat=True)
        question_results = self.get_question_results()

        if segment:
            location_part = segment.get('location').lower()

            if location_part not in ['state', 'district']:
                return None

            location_boundaries = org.get_segment_org_boundaries(segment)

            for boundary in location_boundaries:
                categories = []
                osm_id = boundary.get('osm_id').upper()
                set_count = 0

                unset_count_key = "ruleset:%s:nocategory:%s:%s" % (self.ruleset_uuid, location_part, osm_id)
                unset_count = question_results.get(unset_count_key, 0)

                for categorie_label in categories_label:
                    category_count_key = "ruleset:%s:category:%s:%s:%s" % (self.ruleset_uuid, categorie_label.lower(), location_part, osm_id)
                    category_count = question_results.get(category_count_key, 0)
                    set_count += category_count
                    categories.append(dict(count=category_count, label=categorie_label))

                if open_ended:
                    # For home page best and worst location responses
                    from ureport.contacts.models import Contact
                    if segment.get('location') == 'District':
                        boundary_contacts_count = Contact.objects.filter(org=org, district=osm_id).count()
                    else:
                        boundary_contacts_count = Contact.objects.filter(org=org, state=osm_id).count()
                    unset_count = boundary_contacts_count - set_count

                results.append(dict(open_ended=open_ended, set=set_count, unset=unset_count,
                                    boundary=osm_id, label=boundary.get('name'), categories=categories))

        else:
            categories = []
            for categorie_label in categories_label:
                category_count_key = "ruleset:%s:category:%s" % (self.ruleset_uuid, categorie_label.lower())
                if categorie_label.lower() != 'other':
                    category_count = question_results.get(category_count_key, 0)
                    categories.append(dict(count=category_count, label=categorie_label))

            results.append(dict(open_ended=open_ended, set=responded, unset=polled - responded, categories=categories))

    cache.set(key, {"results": results}, PollQuestion.POLL_QUESTION_RESULTS_CACHE_TIMEOUT)

    return results
def __init__(self):
    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__name__)
    self.stop_words = safe_get_stop_words('german')
    print(self.stop_words)
def get_value_summary(cls, ruleset=None, contact_field=None, filters=None, segment=None):
    """
    Returns the results for the passed in ruleset or contact field given the passed in filters and segments.

    Filters are expected in the following formats:
        { field: rulesetId, categories: ["Red", "Blue", "Yellow"] }

    Segments are expected in these formats instead:
        { ruleset: 1515, categories: ["Red", "Blue"] }  // segmenting by another field, for those categories
        { groups: 124,151,151 }  // segment by each group in the passed in ids
        { location: "State", parent: null }  // segment for each admin boundary within the parent
        { contact_field: "Country", values: ["US", "EN", "RW"] }  // segment by a contact field for these values
    """
    from temba.contacts.models import ContactGroup, ContactField
    from temba.flows.models import TrueTest, RuleSet

    start = time.time()
    results = []

    if (not ruleset and not contact_field) or (ruleset and contact_field):
        raise ValueError("Must specify either a RuleSet or Contact field.")

    org = ruleset.flow.org if ruleset else contact_field.org

    open_ended = ruleset and ruleset.ruleset_type == RuleSet.TYPE_WAIT_MESSAGE and len(ruleset.get_rules()) == 1

    # default our filters to an empty list if None are passed in
    if filters is None:
        filters = []

    # build the kwargs for our subcall
    kwargs = dict(ruleset=ruleset, contact_field=contact_field, filters=filters)

    # this is our list of dependencies, that is things that will blow away our results
    dependencies = set()
    fingerprint_dict = dict(filters=filters, segment=segment)
    if ruleset:
        fingerprint_dict['ruleset'] = ruleset.id
        dependencies.add(RULESET_KEY % ruleset.id)
    if contact_field:
        fingerprint_dict['contact_field'] = contact_field.id
        dependencies.add(CONTACT_KEY % contact_field.id)

    for contact_filter in filters:
        if 'ruleset' in contact_filter:
            dependencies.add(RULESET_KEY % contact_filter['ruleset'])
        if 'groups' in contact_filter:
            for group_id in contact_filter['groups']:
                dependencies.add(GROUP_KEY % group_id)
        if 'location' in contact_filter:
            field = ContactField.get_by_label(org, contact_filter['location'])
            dependencies.add(CONTACT_KEY % field.id)

    if segment:
        if 'ruleset' in segment:
            dependencies.add(RULESET_KEY % segment['ruleset'])
        if 'groups' in segment:
            for group_id in segment['groups']:
                dependencies.add(GROUP_KEY % group_id)
        if 'location' in segment:
            field = ContactField.get_by_label(org, segment['location'])
            dependencies.add(CONTACT_KEY % field.id)

    # our final redis key will contain each dependency as well as a HASH representing the fingerprint of the
    # kwargs passed to this method, generate that hash
    fingerprint = hash(dict_to_json(fingerprint_dict))

    # generate our key
    key = VALUE_SUMMARY_CACHE_KEY + ":" + str(org.id) + ":".join(sorted(list(dependencies))) + ":" + str(fingerprint)

    # does our value exist?
    r = get_redis_connection()
    cached = r.get(key)

    if cached is not None:
        try:
            return json_to_dict(cached)
        except Exception:
            # failed decoding, oh well, go calculate it instead
            pass

    if segment:
        # segmenting a result is the same as calculating the result with the addition of each
        # category as a filter so we expand upon the passed in filters to do this
        if 'ruleset' in segment and 'categories' in segment:
            for category in segment['categories']:
                category_filter = list(filters)
                category_filter.append(dict(ruleset=segment['ruleset'], categories=[category]))

                # calculate our results for this segment
                kwargs['filters'] = category_filter
                (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                results.append(dict(label=category, open_ended=open_ended, set=set_count,
                                    unset=unset_count, categories=categories))

        # segmenting by groups instead, same principle but we add group filters
        elif 'groups' in segment:
            for group_id in segment['groups']:
                # load our group
                group = ContactGroup.user_groups.get(org=org, pk=group_id)

                category_filter = list(filters)
                category_filter.append(dict(groups=[group_id]))

                # calculate our results for this segment
                kwargs['filters'] = category_filter
                (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                results.append(dict(label=group.name, open_ended=open_ended, set=set_count,
                                    unset_count=unset_count, categories=categories))

        # segmenting by a contact field, only for passed in categories
        elif 'contact_field' in segment and 'values' in segment:
            # look up the contact field
            field = ContactField.get_by_label(org, segment['contact_field'])

            for value in segment['values']:
                value_filter = list(filters)
                value_filter.append(dict(contact_field=field.pk, values=[value]))

                # calculate our results for this segment
                kwargs['filters'] = value_filter
                (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                results.append(dict(label=value, open_ended=open_ended, set=set_count,
                                    unset=unset_count, categories=categories))

        # segmenting by a location field
        elif 'location' in segment:
            # look up the contact field
            field = ContactField.get_by_label(org, segment['location'])

            # make sure they are segmenting on a location type that makes sense
            if field.value_type not in [Value.TYPE_STATE, Value.TYPE_DISTRICT, Value.TYPE_WARD]:
                raise ValueError(_("Cannot segment on location for field that is not a State or District type"))

            # make sure our org has a country for location based responses
            if not org.country:
                raise ValueError(_("Cannot segment by location until country has been selected for organization"))

            # the boundaries we will segment by
            parent = org.country

            # figure out our parent
            parent_osm_id = segment.get('parent', None)
            if parent_osm_id:
                parent = AdminBoundary.objects.get(osm_id=parent_osm_id)

            # get all the boundaries we are segmenting on
            boundaries = list(AdminBoundary.objects.filter(parent=parent).order_by('name'))

            # if the field is a district field, they need to specify the parent state
            if not parent_osm_id and field.value_type == Value.TYPE_DISTRICT:
                raise ValueError(_("You must specify a parent state to segment results by district"))

            if not parent_osm_id and field.value_type == Value.TYPE_WARD:
                raise ValueError(_("You must specify a parent state to segment results by ward"))

            # if this is a district, we can speed things up by only including those districts in our parent, build
            # the filter for that
            if parent and field.value_type in [Value.TYPE_DISTRICT, Value.TYPE_WARD]:
                location_filters = [filters, dict(location=field.pk, boundary=[b.osm_id for b in boundaries])]
            else:
                location_filters = filters

            # get all the contacts segment by location first
            (location_set_contacts, location_unset_contacts, location_results) = \
                cls.get_filtered_value_summary(contact_field=field, filters=location_filters, return_contacts=True)

            # now get the contacts for our primary query
            kwargs['return_contacts'] = True
            kwargs['filter_contacts'] = location_set_contacts
            (primary_set_contacts, primary_unset_contacts, primary_results) = cls.get_filtered_value_summary(**kwargs)

            # build a map of osm_id to location_result
            osm_results = {lr['label']: lr for lr in location_results}
            empty_result = dict(contacts=list())

            for boundary in boundaries:
                location_result = osm_results.get(boundary.osm_id, empty_result)

                # clone our primary results
                segmented_results = dict(label=boundary.name, boundary=boundary.osm_id, open_ended=open_ended)

                location_categories = list()
                location_contacts = set(location_result['contacts'])

                for category in primary_results:
                    category_contacts = set(category['contacts'])
                    intersection = location_contacts & category_contacts
                    location_categories.append(dict(label=category['label'], count=len(intersection)))

                segmented_results['set'] = len(location_contacts & primary_set_contacts)
                segmented_results['unset'] = len(location_contacts & primary_unset_contacts)
                segmented_results['categories'] = location_categories
                results.append(segmented_results)

            results = sorted(results, key=lambda r: r['label'])

    else:
        (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)

        # Check that we have an OPEN ENDED ruleset
        if ruleset and len(ruleset.get_rules()) == 1 and isinstance(ruleset.get_rules()[0].test, TrueTest):
            cursor = connection.cursor()

            custom_sql = """SELECT w.label, count(*) AS count FROM (
                SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label
                FROM msgs_msg INNER JOIN contacts_contact ON ( msgs_msg.contact_id = contacts_contact.id )
                WHERE msgs_msg.id IN (
                    SELECT msg_id FROM flows_flowstep_messages, flows_flowstep
                    WHERE flowstep_id = flows_flowstep.id AND flows_flowstep.step_uuid = '%s'
                ) AND contacts_contact.is_test = False
            ) w group by w.label order by count desc;""" % ruleset.uuid

            cursor.execute(custom_sql)
            unclean_categories = get_dict_from_cursor(cursor)
            categories = []

            org_languages = [lang.name.lower() for lang in org.languages.filter(orgs=None).distinct()]
            if 'english' not in org_languages:
                org_languages.append('english')

            ignore_words = []
            for lang in org_languages:
                ignore_words += safe_get_stop_words(lang)

            for category in unclean_categories:
                if len(category['label']) > 1 and category['label'] not in ignore_words and len(categories) < 100:
                    categories.append(dict(label=category['label'], count=int(category['count'])))

            # sort by count, then alphabetically
            categories = sorted(categories, key=lambda c: (-c['count'], c['label']))

        results.append(dict(label=unicode(_("All")), open_ended=open_ended, set=set_count,
                            unset=unset_count, categories=categories))

    # for each of our dependencies, add our key as something that depends on it
    pipe = r.pipeline()
    for dependency in dependencies:
        pipe.sadd(dependency, key)
        pipe.expire(dependency, VALUE_SUMMARY_CACHE_TIME)

    # and finally set our result
    pipe.set(key, dict_to_json(results), VALUE_SUMMARY_CACHE_TIME)
    pipe.execute()

    # leave me: nice for profiling..
    # from django.db import connection as db_connection, reset_queries
    # print "=" * 80
    # for query in db_connection.queries:
    #     print "%s - %s" % (query['time'], query['sql'][:1000])
    # print "-" * 80
    # print "took: %f" % (time.time() - start)
    # print "=" * 80
    # reset_queries()

    return results
documents = []
for i in range(len(text_files)):
    # store = open("store.html", 'w')
    soup = BeautifulSoup(open(text_files[i], encoding="utf8"), "lxml").get_text()
    # store.write(soup)
    new = open(text_files[i], 'w', encoding="utf8")
    new.write(soup)
    new.close()

documents = [open(f, encoding="utf8").read() for f in text_files]
print(len(text_files))

# set stop_words
stop_words_1 = safe_get_stop_words('unsupported language') + get_stop_words('en') + get_stop_words('english') + [
    'DOCTYPE', 'html', 'PUBLIC', 'head', 'meta', 'http', 'content', 'link',
    'rel', 'href', 'title', 'style', 'type', 'import', 'media', 'script',
    'javascript', 'src', 'body', 'div', 'class', 'id', 'name', 'a', 'h3',
    'h1', 'h2', 'table', 'tr', 'td', 'p', 'small', 'span', 'b', 'font', 'li',
    'articles', 'wikipedia', 'text', 'css', 'org', 'th', 'skins', 'width',
    'en', 'wiki', r'\d+\w+'
]

vectorizer = CountVectorizer()
vectorizer = CountVectorizer(stop_words=stop_words_1)

# get tf_idf matrix and transform to 2D list
tfidf_matrix = vectorizer.fit_transform(documents)
# print(tfidf_vectorizer.vocabulary_)
b = tfidf_matrix.todense().tolist()
a = preprocessing.normalize(b, norm='l2')
print(a[0])
for i in range(len(a)):
    print(max(a[i]))

# calculate one_norm value and sort documents
one_norm = []
# nltk.download('wordnet')
# nltk.download('stopwords')
warnings.filterwarnings("ignore")
sys.stdout = open("./output/disaster_output.txt", "w")
plt.style.use('ggplot')

nlp = spacy.load('en_core_web_sm')
deselect_stop_words = ['no', 'not']  # we don't consider no and not stop words
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

lemmatizer = WordNetLemmatizer()
stop_words = safe_get_stop_words('en')

hashtag_regex = re.compile(r"\#\b[\w\-\_]+\b")
twitter_segmenter = Segmenter(corpus="twitter_2018")
camelcase_regex = re.compile(
    r'((?<=[a-z])[A-Z]|(?<!^)[A-Z](?=[a-z])|[0-9]+|(?<=[0-9\-\_])[A-Za-z]|[\-\_])'
)


# DATA PRE-PROCESSING FUNCTIONS
def unescape_tweet(tweet):
    """Unescaping various chars found in text"""
    return html.unescape(tweet)


def strip_html_tags(text):
    """remove html tags from text"""
def calculate_results(self, segment=None):
    org = self.poll.org
    open_ended = self.is_open_ended()
    responded = self.get_responded()
    polled = self.get_polled()

    results = []

    if open_ended and not segment:
        custom_sql = """
        SELECT w.label, count(*) AS count FROM
            (SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label
             FROM polls_pollresult
             WHERE polls_pollresult.org_id = %d
               AND polls_pollresult.flow = '%s'
               AND polls_pollresult.ruleset = '%s'
               AND polls_pollresult.text IS NOT NULL
               AND polls_pollresult.text NOT ILIKE '%s') w
        group by w.label;
        """ % (org.id, self.poll.flow_uuid, self.ruleset_uuid, "http%")
        with connection.cursor() as cursor:
            cursor.execute(custom_sql)

            from ureport.utils import get_dict_from_cursor
            unclean_categories = get_dict_from_cursor(cursor)

        ureport_languages = getattr(settings, 'LANGUAGES', [('en', 'English')])

        org_languages = [lang[1].lower() for lang in ureport_languages if lang[0] == org.language]
        if 'english' not in org_languages:
            org_languages.append('english')

        ignore_words = []
        for lang in org_languages:
            ignore_words += safe_get_stop_words(lang)

        categories = []

        for category in unclean_categories:
            if len(category['label']) > 1 and category['label'] not in ignore_words and len(categories) < 100:
                categories.append(dict(label=category['label'], count=int(category['count'])))

        # sort by count, then alphabetically
        categories = sorted(categories, key=lambda c: (-c['count'], c['label']))

        results.append(dict(open_ended=open_ended, set=responded, unset=polled - responded, categories=categories))

    else:
        categories_label = self.response_categories.filter(is_active=True).values_list('category', flat=True)
        question_results = self.get_question_results()

        if segment:
            location_part = segment.get('location', '').lower()
            age_part = segment.get('age', '').lower()
            gender_part = segment.get('gender', '').lower()

            if location_part in ['state', 'district', 'ward']:
                location_boundaries = org.get_segment_org_boundaries(segment)

                for boundary in location_boundaries:
                    categories = []
                    osm_id = boundary.get('osm_id').upper()
                    set_count = 0

                    unset_count_key = "ruleset:%s:nocategory:%s:%s" % (self.ruleset_uuid, location_part, osm_id)
                    unset_count = question_results.get(unset_count_key, 0)

                    for categorie_label in categories_label:
                        category_count_key = "ruleset:%s:category:%s:%s:%s" % (self.ruleset_uuid, categorie_label.lower(), location_part, osm_id)
                        category_count = question_results.get(category_count_key, 0)
                        set_count += category_count
                        categories.append(dict(count=category_count, label=categorie_label))

                    results.append(dict(open_ended=open_ended, set=set_count, unset=unset_count,
                                        boundary=osm_id, label=boundary.get('name'), categories=categories))

            elif age_part:
                poll_year = self.poll.poll_date.year

                born_results = {k: v for k, v in question_results.iteritems() if k[-9:-5] == 'born'}

                age_intervals = dict()
                age_intervals['35+'] = (35, 2000)
                age_intervals['31-34'] = (31, 34)
                age_intervals['25-30'] = (25, 30)
                age_intervals['20-24'] = (20, 24)
                age_intervals['15-19'] = (15, 19)
                age_intervals['0-14'] = (0, 14)

                for age_group in age_intervals.keys():
                    lower_bound, upper_bound = age_intervals[age_group]
                    unset_count = 0

                    categories_count = dict()
                    for categorie_label in categories_label:
                        if categorie_label.lower() != 'other':
                            categories_count[categorie_label.lower()] = 0

                    for result_key, result_count in born_results.iteritems():
                        age = poll_year - int(result_key[-4:])

                        if lower_bound <= age < upper_bound:
                            if 'nocategory' in result_key:
                                unset_count += result_count

                            for categorie_label in categories_label:
                                if categorie_label.lower() != 'other':
                                    if result_key.startswith('ruleset:%s:category:%s:' % (self.ruleset_uuid, categorie_label.lower())):
                                        categories_count[categorie_label.lower()] += result_count

                    categories = [dict(count=v, label=k) for k, v in categories_count.iteritems()]
                    set_count = sum([elt['count'] for elt in categories])
                    results.append(dict(set=set_count, unset=unset_count, label=age_group, categories=categories))

                results = sorted(results, key=lambda i: i['label'])

            elif gender_part:
                genders = ['f', 'm']
                gender_labels = dict(f=_('Female'), m=_('Male'))

                for gender in genders:
                    categories = []
                    set_count = 0

                    unset_count_key = "ruleset:%s:nocategory:%s:%s" % (self.ruleset_uuid, 'gender', gender)
                    unset_count = question_results.get(unset_count_key, 0)

                    for categorie_label in categories_label:
                        category_count_key = "ruleset:%s:category:%s:%s:%s" % (self.ruleset_uuid, categorie_label.lower(), 'gender', gender)

                        if categorie_label.lower() != 'other':
                            category_count = question_results.get(category_count_key, 0)
                            set_count += category_count
                            categories.append(dict(count=category_count, label=categorie_label))

                    results.append(dict(set=set_count, unset=unset_count,
                                        label=gender_labels.get(gender), categories=categories))

        else:
            categories = []
            for categorie_label in categories_label:
                category_count_key = "ruleset:%s:category:%s" % (self.ruleset_uuid, categorie_label.lower())
                if categorie_label.lower() != 'other':
                    category_count = question_results.get(category_count_key, 0)
                    categories.append(dict(count=category_count, label=categorie_label))

            results.append(dict(open_ended=open_ended, set=responded, unset=polled - responded, categories=categories))

    cache_time = PollQuestion.POLL_QUESTION_RESULTS_CACHE_TIMEOUT
    if not segment:
        cache_time = None

    if segment and segment.get('location', '').lower() == 'state':
        cache_time = None

    if segment and segment.get('age', '').lower() == 'age':
        cache_time = None

    if segment and segment.get('gender', '').lower() == 'gender':
        cache_time = None

    key = PollQuestion.POLL_QUESTION_RESULTS_CACHE_KEY % (self.poll.org.pk, self.poll.pk, self.pk)
    if segment:
        substituted_segment = self.poll.org.substitute_segment(segment)
        key += ":" + slugify(unicode(json.dumps(substituted_segment)))

    cache.set(key, {"results": results}, cache_time)

    return results
tokenize.pattern = re.compile(r"\W+")


def delete_stop_words(query):
    """
    Remove stop words.

    :param query: source list of words
    :return: list with stop words removed
    """
    return (token for token in query
            if token not in delete_stop_words.stop_words_set)


delete_stop_words.stop_words_set = stop_words_set = set(
    safe_get_stop_words("ru") + safe_get_stop_words("en"))


def stem(word):
    """
    Stem a word.

    :param word: source word
    :return: stem of the word
    """
    return min(stem.en.stemWord(word), stem.ru.stemWord(word),
               key=lambda x: len(x))


stem.ru = stemmer('russian')
stem.en = stemmer('english')
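# Illustrative usage (an added sketch, not part of the original source). It
# assumes `stemmer` above is PyStemmer's Stemmer class (e.g. imported as
# `from Stemmer import Stemmer as stemmer`). Stop words from the combined
# Russian/English set are dropped first, then each surviving token is reduced
# to the shorter of its Russian and English stems.
tokens = [u"и", u"the", u"котики", u"cats"]
kept = list(delete_stop_words(tokens))   # likely [u"котики", u"cats"]
print([stem(token) for token in kept])   # e.g. [u"котик", u"cat"]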
from titlecase import titlecase
from unidecode import unidecode
from yarl import URL

env = Env(GITHUB_USERNAME=str, GITHUB_TOKEN=str, PINBOARD_TOKEN=str)

GITHUB_TOKEN = env.str("GITHUB_TOKEN")
GITHUB_USERNAME = env.str("GITHUB_USERNAME")
PINBOARD_TOKEN = env.str("PINBOARD_TOKEN")

IGNORE_WORDS = set(
    [word.lower() for word in Path("IGNORE_WORDS.txt").read_text().split()])
STOP_WORDS = set(
    [word.lower() for word in Path("STOP_WORDS.txt").read_text().split()])
STOP_WORDS.update(set(safe_get_stop_words("english")))

IGNORE_TAGS = IGNORE_WORDS | STOP_WORDS


def get_dev_to_info_for_url(url):
    try:
        req = requests.get(url, timeout=1.0)
        soup = BeautifulSoup(req.text, "html.parser")
        data = {
            "tags": [
                tag.text.lstrip("#")
                for tag in soup.find_all("a", {"class": "tag"})
            ]
        }
        return data
def calculate_results(self, segment=None):
    org = self.poll.org
    open_ended = self.is_open_ended()
    responded = self.get_responded()
    polled = self.get_polled()

    results = []

    if open_ended and not segment:
        custom_sql = """
        SELECT w.label, count(*) AS count FROM
            (SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label
             FROM polls_pollresult
             WHERE polls_pollresult.org_id = %d
               AND polls_pollresult.flow = '%s'
               AND polls_pollresult.ruleset = '%s'
               AND polls_pollresult.text IS NOT NULL
               AND polls_pollresult.text NOT ILIKE '%s') w
        group by w.label;
        """ % (org.id, self.poll.flow_uuid, self.ruleset_uuid, "http%")
        with connection.cursor() as cursor:
            cursor.execute(custom_sql)

            from ureport.utils import get_dict_from_cursor
            unclean_categories = get_dict_from_cursor(cursor)

        ureport_languages = getattr(settings, 'LANGUAGES', [('en', 'English')])

        org_languages = [lang[1].lower() for lang in ureport_languages if lang[0] == org.language]
        if 'english' not in org_languages:
            org_languages.append('english')

        ignore_words = []
        for lang in org_languages:
            ignore_words += safe_get_stop_words(lang)

        categories = []

        # sort by count, then alphabetically
        unclean_categories = sorted(unclean_categories, key=lambda c: (-c['count'], c['label']))

        for category in unclean_categories:
            if len(category['label']) > 1 and category['label'] not in ignore_words and len(categories) < 100:
                categories.append(dict(label=category['label'], count=int(category['count'])))

        results.append(dict(open_ended=open_ended, set=responded, unset=polled - responded, categories=categories))

    else:
        categories_label = self.response_categories.filter(is_active=True).values_list('category', flat=True)
        question_results = self.get_question_results()

        if segment:
            location_part = segment.get('location', '').lower()
            age_part = segment.get('age', '').lower()
            gender_part = segment.get('gender', '').lower()

            if location_part in ['state', 'district', 'ward']:
                location_boundaries = org.get_segment_org_boundaries(segment)

                for boundary in location_boundaries:
                    categories = []
                    osm_id = boundary.get('osm_id').upper()
                    set_count = 0

                    unset_count_key = "ruleset:%s:nocategory:%s:%s" % (self.ruleset_uuid, location_part, osm_id)
                    unset_count = question_results.get(unset_count_key, 0)

                    for categorie_label in categories_label:
                        if categorie_label.lower() not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                            category_count_key = "ruleset:%s:category:%s:%s:%s" % (self.ruleset_uuid, categorie_label.lower(), location_part, osm_id)
                            category_count = question_results.get(category_count_key, 0)
                            set_count += category_count
                            categories.append(dict(count=category_count, label=categorie_label))

                    results.append(dict(open_ended=open_ended, set=set_count, unset=unset_count,
                                        boundary=osm_id, label=boundary.get('name'), categories=categories))

            elif age_part:
                poll_year = self.poll.poll_date.year

                born_results = {k: v for k, v in question_results.iteritems() if k[-9:-5] == 'born'}

                age_intervals = dict()
                age_intervals['35+'] = (35, 2000)
                age_intervals['31-34'] = (31, 34)
                age_intervals['25-30'] = (25, 30)
                age_intervals['20-24'] = (20, 24)
                age_intervals['15-19'] = (15, 19)
                age_intervals['0-14'] = (0, 14)

                for age_group in age_intervals.keys():
                    lower_bound, upper_bound = age_intervals[age_group]
                    unset_count = 0

                    categories_count = dict()
                    for categorie_label in categories_label:
                        if categorie_label.lower() not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                            categories_count[categorie_label.lower()] = 0

                    for result_key, result_count in born_results.iteritems():
                        age = poll_year - int(result_key[-4:])
                        if lower_bound <= age < upper_bound:
                            if 'nocategory' in result_key:
                                unset_count += result_count

                            for categorie_label in categories_label:
                                if categorie_label.lower() not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                                    if result_key.startswith('ruleset:%s:category:%s:' % (self.ruleset_uuid, categorie_label.lower())):
                                        categories_count[categorie_label.lower()] += result_count

                    categories = [dict(count=v, label=k) for k, v in categories_count.iteritems()]
                    set_count = sum([elt['count'] for elt in categories])
                    results.append(dict(set=set_count, unset=unset_count, label=age_group, categories=categories))

                results = sorted(results, key=lambda i: i['label'])

            elif gender_part:
                genders = ['f', 'm']
                gender_labels = dict(f=_('Female'), m=_('Male'))

                for gender in genders:
                    categories = []
                    set_count = 0

                    unset_count_key = "ruleset:%s:nocategory:%s:%s" % (self.ruleset_uuid, 'gender', gender)
                    unset_count = question_results.get(unset_count_key, 0)

                    for categorie_label in categories_label:
                        category_count_key = "ruleset:%s:category:%s:%s:%s" % (self.ruleset_uuid, categorie_label.lower(), 'gender', gender)

                        if categorie_label.lower() not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                            category_count = question_results.get(category_count_key, 0)
                            set_count += category_count
                            categories.append(dict(count=category_count, label=categorie_label))

                    results.append(dict(set=set_count, unset=unset_count,
                                        label=gender_labels.get(gender), categories=categories))

        else:
            categories = []
            for categorie_label in categories_label:
                category_count_key = "ruleset:%s:category:%s" % (self.ruleset_uuid, categorie_label.lower())
                if categorie_label.lower() not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                    category_count = question_results.get(category_count_key, 0)
                    categories.append(dict(count=category_count, label=categorie_label))

            results.append(dict(open_ended=open_ended, set=responded, unset=polled - responded, categories=categories))

    cache_time = PollQuestion.POLL_QUESTION_RESULTS_CACHE_TIMEOUT
    if not segment:
        cache_time = None

    if segment and segment.get('location', '').lower() == 'state':
        cache_time = None

    if segment and segment.get('age', '').lower() == 'age':
        cache_time = None

    if segment and segment.get('gender', '').lower() == 'gender':
        cache_time = None

    key = PollQuestion.POLL_QUESTION_RESULTS_CACHE_KEY % (self.poll.org.pk, self.poll.pk, self.pk)
    if segment:
        substituted_segment = self.poll.org.substitute_segment(segment)
        key += ":" + slugify(unicode(json.dumps(substituted_segment)))

    cache.set(key, {"results": results}, cache_time)

    return results
def get_value_summary(cls, ruleset=None, contact_field=None, filters=None, segment=None):
    """
    Returns the results for the passed in ruleset or contact field given the passed in filters and segments.

    Filters are expected in the following formats:
        { field: rulesetId, categories: ["Red", "Blue", "Yellow"] }

    Segments are expected in these formats instead:
        { ruleset: 1515, categories: ["Red", "Blue"] }             // segmenting by another field, for those categories
        { groups: 124,151,151 }                                    // segment by each group in the passed in ids
        { location: "State", parent: null }                        // segment for each admin boundary within the parent
        { contact_field: "Country", values: ["US", "EN", "RW"] }   // segment by a contact field for these values
    """
    from temba.contacts.models import ContactGroup, ContactField
    from temba.flows.models import TrueTest, RuleSet

    # start = time.time()

    results = []

    if (not ruleset and not contact_field) or (ruleset and contact_field):  # pragma: needs cover
        raise ValueError("Must specify either a RuleSet or Contact field.")

    org = ruleset.flow.org if ruleset else contact_field.org

    open_ended = ruleset and ruleset.ruleset_type == RuleSet.TYPE_WAIT_MESSAGE and len(ruleset.get_rules()) == 1

    # default our filters to an empty list if None are passed in
    if filters is None:
        filters = []

    # build the kwargs for our subcall
    kwargs = dict(ruleset=ruleset, contact_field=contact_field, filters=filters)

    # this is our list of dependencies, that is things that will blow away our results
    dependencies = set()
    fingerprint_dict = dict(filters=filters, segment=segment)

    if ruleset:
        fingerprint_dict['ruleset'] = ruleset.id
        dependencies.add(RULESET_KEY % ruleset.id)

    if contact_field:
        fingerprint_dict['contact_field'] = contact_field.id
        dependencies.add(CONTACT_KEY % contact_field.id)

    for contact_filter in filters:
        if 'ruleset' in contact_filter:
            dependencies.add(RULESET_KEY % contact_filter['ruleset'])
        if 'groups' in contact_filter:
            for group_id in contact_filter['groups']:
                dependencies.add(GROUP_KEY % group_id)
        if 'location' in contact_filter:  # pragma: needs cover
            field = ContactField.get_by_label(org, contact_filter['location'])
            dependencies.add(CONTACT_KEY % field.id)

    if segment:
        if 'ruleset' in segment:
            dependencies.add(RULESET_KEY % segment['ruleset'])
        if 'groups' in segment:  # pragma: needs cover
            for group_id in segment['groups']:
                dependencies.add(GROUP_KEY % group_id)
        if 'location' in segment:
            field = ContactField.get_by_label(org, segment['location'])
            dependencies.add(CONTACT_KEY % field.id)

    # our final redis key will contain each dependency as well as a HASH representing the fingerprint of the
    # kwargs passed to this method, generate that hash
    fingerprint = hash(dict_to_json(fingerprint_dict))

    # generate our key
    key = VALUE_SUMMARY_CACHE_KEY + ":" + str(org.id) + ":".join(sorted(list(dependencies))) + ":" + str(fingerprint)

    # does our value exist?
    r = get_redis_connection()
    cached = r.get(key)

    if cached is not None:
        try:
            return json_to_dict(cached)
        except Exception:  # pragma: needs cover
            # failed decoding, oh well, go calculate it instead
            pass

    if segment:
        # segmenting a result is the same as calculating the result with the addition of each
        # category as a filter so we expand upon the passed in filters to do this
        if 'ruleset' in segment and 'categories' in segment:
            for category in segment['categories']:
                category_filter = list(filters)
                category_filter.append(dict(ruleset=segment['ruleset'], categories=[category]))

                # calculate our results for this segment
                kwargs['filters'] = category_filter
                (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                results.append(dict(label=category, open_ended=open_ended, set=set_count,
                                    unset=unset_count, categories=categories))

        # segmenting by groups instead, same principle but we add group filters
        elif 'groups' in segment:  # pragma: needs cover
            for group_id in segment['groups']:
                # load our group
                group = ContactGroup.user_groups.get(org=org, pk=group_id)

                category_filter = list(filters)
                category_filter.append(dict(groups=[group_id]))

                # calculate our results for this segment
                kwargs['filters'] = category_filter
                (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                results.append(dict(label=group.name, open_ended=open_ended, set=set_count,
                                    unset=unset_count, categories=categories))

        # segmenting by a contact field, only for passed in categories
        elif 'contact_field' in segment and 'values' in segment:
            # look up the contact field
            field = ContactField.get_by_label(org, segment['contact_field'])

            for value in segment['values']:
                value_filter = list(filters)
                value_filter.append(dict(contact_field=field.pk, values=[value]))

                # calculate our results for this segment
                kwargs['filters'] = value_filter
                (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                results.append(dict(label=value, open_ended=open_ended, set=set_count,
                                    unset=unset_count, categories=categories))

        # segmenting by a location field
        elif 'location' in segment:
            # look up the contact field
            field = ContactField.get_by_label(org, segment['location'])

            # make sure they are segmenting on a location type that makes sense
            if field.value_type not in [Value.TYPE_STATE, Value.TYPE_DISTRICT, Value.TYPE_WARD]:  # pragma: needs cover
                raise ValueError(_("Cannot segment on location for field that is not a State or District type"))

            # make sure our org has a country for location based responses
            if not org.country:  # pragma: needs cover
                raise ValueError(_("Cannot segment by location until country has been selected for organization"))

            # the boundaries we will segment by
            parent = org.country

            # figure out our parent
            parent_osm_id = segment.get('parent', None)
            if parent_osm_id:
                parent = AdminBoundary.objects.get(osm_id=parent_osm_id)

            # get all the boundaries we are segmenting on
            boundaries = list(AdminBoundary.objects.filter(parent=parent).order_by('name'))

            # if the field is a district field, they need to specify the parent state
            if not parent_osm_id and field.value_type == Value.TYPE_DISTRICT:  # pragma: needs cover
                raise ValueError(_("You must specify a parent state to segment results by district"))

            if not parent_osm_id and field.value_type == Value.TYPE_WARD:  # pragma: needs cover
                raise ValueError(_("You must specify a parent state to segment results by ward"))

            # if this is a district, we can speed things up by only including those districts in our parent,
            # build the filter for that
            if parent and field.value_type in [Value.TYPE_DISTRICT, Value.TYPE_WARD]:
                location_filters = [filters, dict(location=field.pk, boundary=[b.osm_id for b in boundaries])]
            else:
                location_filters = filters

            # get all the contacts segmented by location first
            (location_set_contacts, location_unset_contacts, location_results) = \
                cls.get_filtered_value_summary(contact_field=field, filters=location_filters, return_contacts=True)

            # now get the contacts for our primary query
            kwargs['return_contacts'] = True
            kwargs['filter_contacts'] = location_set_contacts
            (primary_set_contacts, primary_unset_contacts, primary_results) = cls.get_filtered_value_summary(**kwargs)

            # build a map of osm_id to location_result
            osm_results = {lr['label']: lr for lr in location_results}
            empty_result = dict(contacts=list())

            for boundary in boundaries:
                location_result = osm_results.get(boundary.osm_id, empty_result)

                # clone our primary results
                segmented_results = dict(label=boundary.name, boundary=boundary.osm_id, open_ended=open_ended)

                location_categories = list()
                location_contacts = set(location_result['contacts'])

                for category in primary_results:
                    category_contacts = set(category['contacts'])
                    intersection = location_contacts & category_contacts
                    location_categories.append(dict(label=category['label'], count=len(intersection)))

                segmented_results['set'] = len(location_contacts & primary_set_contacts)
                segmented_results['unset'] = len(location_contacts & primary_unset_contacts)
                segmented_results['categories'] = location_categories
                results.append(segmented_results)

            results = sorted(results, key=lambda r: r['label'])
    else:
        (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)

        # check whether we have an OPEN ENDED ruleset
        if ruleset and len(ruleset.get_rules()) == 1 and isinstance(ruleset.get_rules()[0].test, TrueTest):
            cursor = connection.cursor()

            custom_sql = """SELECT w.label, count(*) AS count FROM (
                SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label
                FROM msgs_msg
                INNER JOIN contacts_contact ON (msgs_msg.contact_id = contacts_contact.id)
                WHERE msgs_msg.id IN (
                    SELECT msg_id FROM flows_flowstep_messages, flows_flowstep
                    WHERE flowstep_id = flows_flowstep.id AND flows_flowstep.step_uuid = '%s'
                ) AND contacts_contact.is_test = False
            ) w GROUP BY w.label ORDER BY count DESC;""" % ruleset.uuid

            cursor.execute(custom_sql)
            unclean_categories = get_dict_from_cursor(cursor)
            categories = []

            org_languages = [lang.name.lower() for lang in org.languages.filter(orgs=None).distinct()]
            if 'english' not in org_languages:
                org_languages.append('english')

            ignore_words = []
            for lang in org_languages:
                ignore_words += safe_get_stop_words(lang)

            for category in unclean_categories:
                if len(category['label']) > 1 and category['label'] not in ignore_words and len(categories) < 100:
                    categories.append(dict(label=category['label'], count=int(category['count'])))

            # sort by count, then alphabetically
            categories = sorted(categories, key=lambda c: (-c['count'], c['label']))

        results.append(dict(label=six.text_type(_("All")), open_ended=open_ended, set=set_count,
                            unset=unset_count, categories=categories))

    # for each of our dependencies, add our key as something that depends on it
    pipe = r.pipeline()
    for dependency in dependencies:
        pipe.sadd(dependency, key)
        pipe.expire(dependency, VALUE_SUMMARY_CACHE_TIME)

    # and finally set our result
    pipe.set(key, dict_to_json(results), VALUE_SUMMARY_CACHE_TIME)
    pipe.execute()

    # leave me: nice for profiling..
    # from django.db import connection as db_connection, reset_queries
    # print "=" * 80
    # for query in db_connection.queries:
    #     print "%s - %s" % (query['time'], query['sql'][:1000])
    # print "-" * 80
    # print "took: %f" % (time.time() - start)
    # print "=" * 80
    # reset_queries()

    return results
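# Illustrative usage only (not part of the original module): how the filter and
# segment dicts described in the docstring of get_value_summary might be built.
# The ids, labels and the owning class name are assumptions for this sketch.
example_filters = [dict(ruleset=1515, categories=["Red", "Blue", "Yellow"])]
example_segment = dict(location="State", parent=None)

# assuming the method lives on a Value-like model class and a RuleSet instance is at hand:
# results = Value.get_value_summary(ruleset=my_ruleset, filters=example_filters, segment=example_segment)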
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *
import gensim
from collections import defaultdict
from stop_words import safe_get_stop_words
from sklearn.model_selection import train_test_split
from termcolor import colored

# these two imports are used below but were missing from the original fragment
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

stop_words = safe_get_stop_words('russian')

# load the reviews and normalise the text
data = pd.read_excel('reviews.xlsx')
data.columns = ['rate', 'text']
data.text = data.text.str.lower()

# collapse 1-5 star ratings into negative (-1), neutral (0) and positive (1)
data.rate.replace(to_replace=[1, 2], value=-1, inplace=True)
data.rate.replace(to_replace=[3], value=0, inplace=True)
data.rate.replace(to_replace=[4, 5], value=1, inplace=True)

y = data.rate
x = data.text.values

# bag-of-words features with Russian stop words removed
cvec = CountVectorizer(stop_words=stop_words)
x_cvec = cvec.fit_transform(x)
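# A minimal continuation sketch (not in the original snippet) showing how the
# classifiers imported above could be applied to the vectorized reviews; the
# 80/20 split and the choice of MultinomialNB are assumptions for illustration.
x_train, x_test, y_train, y_test = train_test_split(x_cvec, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(x_train, y_train)
predicted = model.predict(x_test)

# accuracy_score is available via the wildcard import from sklearn.metrics above
print(colored('accuracy: {:.3f}'.format(accuracy_score(y_test, predicted)), 'green'))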
# imports needed by this fragment (added here; the original excerpt begins below)
import numpy as np
from PIL import Image
from stop_words import safe_get_stop_words
from wordcloud import STOPWORDS, ImageColorGenerator

# fall back to defaults when the optional CLI arguments were not given
if args.w is None:
    args.w = "english"
if args.min is None:
    args.min = 0
if args.boost is None:
    args.boost = 1
if args.blow is None:
    args.blow = 1

# built-in English stop words plus any extra languages requested via -l
stopwords = set(STOPWORDS)
if args.l is not None:
    for language in args.l:
        stopwords.update(safe_get_stop_words(language.lower()))

# optional image mask and a colour generator derived from it
mask = None
colors = None
if args.m is not None:
    print("Creating mask...", end=" ", flush=True)
    mask = np.array(Image.open(args.m).convert("RGB"))
    colors = ImageColorGenerator(mask)
    print("Done!")

# values taken from -c (likely a contour width and colour)
cw = 0
cc = None
if args.c is not None:
    cw = int(args.c[0])
    cc = args.c[1]
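# A hedged sketch (not part of the original fragment) of how the prepared
# stopwords, mask and colour generator might feed the final rendering; `text`
# stands in for whatever input the surrounding script actually loads.
from wordcloud import WordCloud

text = "sample text standing in for the real input collected by the script"
wc = WordCloud(stopwords=stopwords, mask=mask, background_color="white").generate(text)
if colors is not None:
    wc = wc.recolor(color_func=colors)  # reuse the colours of the mask image
wc.to_file("wordcloud.png")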