Example #1
def analyze_statements(statements, corpus_name):
    results = []
    # Longest n-grams first, so multi-word phrases claim their words
    # before shorter n-grams can repeat them.
    ngram_lengths = [
        {'length': 3, 'max_count': 3},
        {'length': 2, 'max_count': 8},
        {'length': 1, 'max_count': 20},
    ]
    # Bail out when there is too little text for a meaningful model.
    if sum(s.wordcount for s in statements) < 1000:
        return None
    seen = set(STOPWORDS)
    for opts in ngram_lengths:
        bg = load_background_model(corpus_name, opts['length'])
        # Rank n-grams by how much they stand out against the background corpus.
        model = FrequencyModel.from_statement_qs(statements,
                                                 opts['length']).diff(bg)
        count = 0
        for text, score in model.most_common(50):
            # Stop once this n-gram length has contributed its quota.
            if count >= opts['max_count']:
                break
            words = text.split(' ')
            # Skip phrases whose first or last word is a stopword
            # or has already been used by a longer phrase.
            if words[0] not in seen and words[-1] not in seen:
                seen.update(words)
                count += 1
                results.append({
                    'text': text,
                    'score': score * 1000,
                })
    return results
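
For context, a minimal sketch of how analyze_statements might be driven. The Statement model and the corpus name are hypothetical stand-ins; the name from_statement_qs suggests a Django queryset, and the code above only relies on each statement exposing a wordcount attribute.

# Hypothetical driver; Statement and the 'debates' corpus are assumptions.
statements = Statement.objects.filter(corpus_name='debates')
terms = analyze_statements(statements, 'debates')
if terms is None:
    print('fewer than 1000 words; nothing to analyze')
else:
    for term in terms:
        print(term['text'], term['score'])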
Example #2
import pickle


def generate_background_models(corpus_name, statements, ngram_lengths=(1, 2, 3)):
    for n in ngram_lengths:
        # Short n-grams are naturally frequent, so demand more occurrences
        # before a unigram or bigram enters the background model.
        bg = FrequencyModel.from_statement_qs(statements,
                                              ngram=n,
                                              min_count=5 if n < 3 else 3)
        # Persist the model so analyze_statements can diff against it later.
        with open(_get_background_model_path(corpus_name, n), 'wb') as f:
            pickle.dump(bg, f, pickle.HIGHEST_PROTOCOL)
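
load_background_model, which analyze_statements calls above, is not shown in these examples. A minimal sketch of it, under the assumption that it simply unpickles what generate_background_models wrote, using the same (unshown) _get_background_model_path helper:

import pickle

def load_background_model(corpus_name, n):
    # Assumed counterpart: read back the pickled FrequencyModel
    # for this corpus and n-gram length.
    with open(_get_background_model_path(corpus_name, n), 'rb') as f:
        return pickle.load(f)

In this reading, the background models are generated once from a large reference corpus, and each analyze_statements call diffs a smaller set of statements against them to surface distinctive phrases.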