def parse(self, log: ProcessLogger, text, text_unit_id, _text_unit_lang, document_initial_load: bool = False, **kwargs) -> ParseResults:
    """Locate configured dictionary terms in a text unit and count their usages.

    Looks up the (optionally project-scoped) stemmed-term configuration,
    matches each stem against the stemmed text, and produces a ``TermUsage``
    per matched term with its occurrence count.
    """
    project_id = kwargs.get('project_id')
    stem_config = dict_data_cache.get_term_config(project_id)
    # Space-padded so a stem key (itself space-padded) only matches on
    # whole-stem boundaries when tested with the ``in`` operator.
    stemmed_text = ' %s ' % ' '.join(get_stems(text, lowercase=True))
    tokens = get_token_list(text, lowercase=True)
    usages = []
    for stem, info in stem_config.items():
        # stem not found in text
        if stem not in stemmed_text:
            continue
        if info['length'] == 1:
            # if stem has 1 variant only
            occurrences = stemmed_text.count(stem)
            if occurrences:
                entry = list(info['values'][0])
                entry.append(occurrences)
                usages.append(entry)
        else:
            # case when f.e. stem "respons" is equal to multiple terms
            # ["response", "responsive", "responsibility"]
            for variant in info['values']:
                entry = list(variant)
                occurrences = tokens.count(entry[0])
                if occurrences:
                    entry.append(occurrences)
                    usages.append(entry)
                # TODO: "responsibilities"
    return ParseResults({TermUsage: [
        TermUsage(text_unit_id=text_unit_id, term_id=pk, count=num)
        for _, pk, num in usages]})
def get_stem_distribution(text: str, lowercase=False, stopword=False) -> Dict[str, int]:
    """
    Get stemmed token distribution of text, potentially lowercasing and
    stopwording first.

    :param text: source text to stem and count
    :param lowercase: lowercase tokens before stemming
    :param stopword: drop stopwords before stemming
    :return: plain dict mapping each stem to its occurrence count
    """
    from collections import Counter

    # Counter builds the frequency table in one O(n) pass; the previous
    # set-then-count approach re-scanned the token list per distinct stem.
    # Wrapping in dict() keeps the declared plain-dict return type.
    return dict(Counter(get_stems(text, lowercase=lowercase, stopword=stopword)))
def cache_term_stems(*args, **kwargs):
    """Build the stem -> term-variants mapping for all terms and cache it.

    Each key is a space-padded, space-joined stem string; each value is a
    dict with the list of ``[term, pk]`` variants and their count.
    """
    grouped = {}
    for term_text, term_pk in Term.objects.values_list('term', 'pk'):
        stem_key = ' %s ' % ' '.join(get_stems(term_text))
        grouped.setdefault(stem_key, []).append([term_text, term_pk])
    payload = {stem_key: dict(values=variants, length=len(variants))
               for stem_key, variants in grouped.items()}
    DbCache.put_to_db(CACHE_KEY_TERM_STEMS, payload)
def cache_term_stems(project_id=None):
    """Build and cache the stem -> term-variants mapping, optionally per project.

    When ``project_id`` is given and that project has a term configuration,
    only the configured terms are stemmed and the result is stored under a
    project-specific cache key; otherwise all terms go under the global key.
    """
    terms_qs = Term.objects
    cache_key = CACHE_KEY_TERM_STEMS
    if project_id is not None:
        config_qs = ProjectTermConfiguration.objects.filter(project_id=project_id)
        if config_qs.exists():
            terms_qs = config_qs.last().terms
            cache_key = CACHE_KEY_TERM_STEMS_PROJECT_PTN.format(project_id)
    grouped = {}
    for term_text, term_pk in terms_qs.values_list('term', 'pk'):
        stem_key = ' %s ' % ' '.join(get_stems(term_text))
        grouped.setdefault(stem_key, []).append([term_text, term_pk])
    payload = {stem_key: dict(values=variants, length=len(variants))
               for stem_key, variants in grouped.items()}
    DbCache.put_to_db(cache_key, payload)
def get_similar_to_terms_employee(text, positives, negatives):
    """
    Use Employment Agreement W2V to get terms similar to those provided
    and search text for those.

    :param text: text to stem and scan
    :param positives: terms whose presence counts in favor
    :param negatives: terms whose presence counts against
    :return: 1 when a positive stem is present without any negative stem;
        otherwise the mean W2V similarity of the text's stems found among the
        model's most-similar words, or 0 when none are found
    """
    stems = get_stems(text)
    positive_found = any(p in stems for p in positives)
    # Negatives are only consulted when a positive was found, matching the
    # original short-circuit behavior.
    negative_found = positive_found and any(n in stems for n in negatives)
    if positive_found and not negative_found:
        return 1
    # Only load the model when the cheap stem check was inconclusive.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    w2v_model = gensim.models.word2vec.Word2Vec.load(
        os.path.normpath(
            os.path.join(dir_path, "data/w2v_cbow_employment_size200_window10")))
    trained_similar_words = dict(
        w2v_model.wv.most_similar(positive=positives, negative=negatives))
    sum_similarity = 0
    num_similars = 0
    for stem in stems:
        similarity = trained_similar_words.get(stem)
        if similarity is not None:
            sum_similarity += similarity
            num_similars += 1
    # BUG FIX: was ``num_similars is not 0`` — identity comparison on an int
    # (implementation-dependent, SyntaxWarning on modern CPython); use
    # truthiness instead.
    if num_similars:
        return sum_similarity / num_similars
    return 0