Example #1
    def parse(self, log: ProcessLogger, text, text_unit_id, _text_unit_lang,
              document_initial_load: bool = False, **kwargs) -> ParseResults:
        project_id = kwargs.get('project_id')
        term_stems = dict_data_cache.get_term_config(project_id)
        text_stems = ' %s ' % ' '.join(get_stems(text, lowercase=True))
        text_tokens = get_token_list(text, lowercase=True)
        term_usages = []
        for stemmed_term, data in term_stems.items():
            # stem not found in text
            if stemmed_term not in text_stems:
                continue
            # stem maps to a single term variant
            if data['length'] == 1:
                count = text_stems.count(stemmed_term)
                if count:
                    term_data = list(data['values'][0])
                    term_data.append(count)
                    term_usages.append(term_data)
            # case when e.g. the stem "respons" maps to multiple terms:
            # ["response", "responsive", "responsibility"]
            else:
                for term_data in data['values']:
                    term_data = list(term_data)
                    count = text_tokens.count(term_data[0])
                    if count:
                        term_data.append(count)
                        term_usages.append(term_data)
                        # TODO: "responsibilities"

        return ParseResults({TermUsage: [
            TermUsage(text_unit_id=text_unit_id,
                      term_id=pk,
                      count=count) for _, pk, count in term_usages]})
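The single-variant fast path above works because get_stems renders both the dictionary terms and the text as space-delimited stem strings, so a whole-stem substring check is enough. A minimal sketch of that matching idea, using NLTK's PorterStemmer as a stand-in for the project-internal get_stems (an assumption, not the actual implementation):

# Sketch of the stem-substring match; PorterStemmer is assumed here
# as a stand-in for the project's get_stems helper.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

def stem_string(text):
    # Surrounding spaces make ' respons ' match whole stems only
    return ' %s ' % ' '.join(stemmer.stem(t) for t in word_tokenize(text.lower()))

text_stems = stem_string('The responses were timely.')
term_stem = stem_string('response')   # ' respons '
print(term_stem in text_stems)        # True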
Example #2
from typing import Dict


def get_stem_distribution(text: str, lowercase=False, stopword=False) -> Dict[str, int]:
    """
    Get stemmed token distribution of text, potentially lowercasing and stopwording first.
    """
    # Collect the token stems
    tokens = list(get_stems(text, lowercase=lowercase, stopword=stopword))

    # Calculate distribution
    token_distribution = {t: tokens.count(t) for t in set(tokens)}
    return token_distribution
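A quick usage sketch for the function above; the exact keys depend on the get_stems implementation, but with an English stemmer the call would behave roughly like this:

# Hypothetical call; actual stems depend on get_stems
dist = get_stem_distribution('Responses and responsive responses', lowercase=True)
# e.g. {'respons': 3, 'and': 1}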
Example #3
def cache_term_stems(*args, **kwargs):
    term_stems = {}
    # Group (term, pk) pairs by their space-delimited stemmed form
    for t, pk in Term.objects.values_list('term', 'pk'):
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    # Record how many distinct terms share each stem
    for item in term_stems:
        term_stems[item] = dict(values=term_stems[item],
                                length=len(term_stems[item]))
    DbCache.put_to_db(CACHE_KEY_TERM_STEMS, term_stems)
Example #4
def cache_term_stems(project_id=None):
    term_stems = {}

    terms_qs = Term.objects
    key = CACHE_KEY_TERM_STEMS

    # Use the project-specific term list and cache key when configured
    if project_id is not None:
        qs = ProjectTermConfiguration.objects.filter(project_id=project_id)
        if qs.exists():
            terms_qs = qs.last().terms
            key = CACHE_KEY_TERM_STEMS_PROJECT_PTN.format(project_id)

    for t, pk in terms_qs.values_list('term', 'pk'):
        stemmed_term = ' %s ' % ' '.join(get_stems(t))
        stemmed_item = term_stems.get(stemmed_term, [])
        stemmed_item.append([t, pk])
        term_stems[stemmed_term] = stemmed_item
    for item in term_stems:
        term_stems[item] = dict(values=term_stems[item],
                                length=len(term_stems[item]))
    DbCache.put_to_db(key, term_stems)
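Both variants cache the same shape: a mapping from a space-delimited stem to the terms that share it, plus a precomputed count. An illustrative literal (term strings and primary keys are made up) of the structure that Example #1's parse later reads back through dict_data_cache.get_term_config:

# Illustrative cached structure; terms and pks are made up
term_stems = {
    ' respons ': {'values': [['response', 12], ['responsive', 57]], 'length': 2},
    ' terminat ': {'values': [['termination', 3]], 'length': 1},
}

The length field is what selects the fast path in Example #1: length == 1 permits a pure stem count, while longer lists require a per-term token count.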
Example #5
import os

import gensim


def get_similar_to_terms_employee(text, positives, negatives):
    """
    Use Employment Agreement W2V to get terms similar
    to those provided and search text for those
    """
    stems = list(get_stems(text))
    dir_path = os.path.dirname(os.path.realpath(__file__))

    # Direct hit: a positive term's stem appears and no negative term's does
    positive_found = any(p in stems for p in positives)
    negative_found = positive_found and any(n in stems for n in negatives)
    if positive_found and not negative_found:
        return 1

    w2v_model = gensim.models.word2vec.Word2Vec.load(
        os.path.normpath(
            os.path.join(dir_path,
                         "data/w2v_cbow_employment_size200_window10")))
    trained_similar_words = w2v_model.wv.most_similar(positive=positives,
                                                      negative=negatives)

    trained_similar_words = dict(trained_similar_words)

    sum_similarity = 0
    num_similars = 0
    for i in stems:
        similarity = trained_similar_words.get(i)
        if similarity is not None:
            sum_similarity += similarity
            num_similars += 1
    if num_similars != 0:
        return sum_similarity / num_similars
    return 0
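The returned score is just the mean most_similar similarity over matched stems. A standalone sketch of that final step, with a made-up similarity dict in place of the model output:

# Averaging step in isolation; similarity values are made up
trained_similar_words = {'salari': 0.81, 'wage': 0.77, 'vacat': 0.64}
stems = ['employe', 'salari', 'wage', 'agreement']

matched = [trained_similar_words[s] for s in stems if s in trained_similar_words]
print(sum(matched) / len(matched) if matched else 0)   # (0.81 + 0.77) / 2 = 0.79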