Example #1
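Builds token counts for every party platform and writes an IDF-style weight for each term to data/idf.json.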
import json
import math
from collections import defaultdict

from sections import load_platforms  # as in Example #6

# ALL is an aggregate key assumed to be defined elsewhere in the module.


def generate_idf_model():
    # Token occurrence counts per party, plus an aggregate over all parties.
    terms = {ALL: defaultdict(int)}

    print("Extracting...")
    counts = {ALL: 0}

    for party, sections in load_platforms().items():
        counts[party] = 0
        terms[party] = defaultdict(int)

        for section in sections:
            counts[party] += 1
            counts[ALL] += 1

            for token in section.tokens:
                terms[party][token] += 1
                terms[ALL][token] += 1

    print("Calculating IDF...")
    data = {}
    for party, tokens in terms.items():
        num = float(counts[party])
        data[party] = {}

        for term, count in tokens.items():
            idf = math.log(num / (1 + count))
            data[party][term] = idf

    # json.dump expects a text-mode file handle in Python 3.
    with open("data/idf.json", "w") as fh:
        json.dump(data, fh)
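Each term's weight is log(N / (1 + count)), where N is the party's section count and count is how often the term occurs, so frequent terms receive lower (eventually negative) scores.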
Example #2
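Scores every platform section by how many phrases from decisive.txt and how many tokens from loriot.txt appear in its normalised text, each divided by the section length, and saves the scores to data/language.json.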
import json
from collections import defaultdict

from sections import load_platforms  # as in Example #6

# norm, normalize and tokenize are helpers assumed to be defined in this module.


def classify():
    # One search phrase per line in decisive.txt; loriot.txt is tokenized as a whole.
    with open('decisive.txt', encoding='utf-8') as fh:
        decisive = [norm(line) for line in fh]
    with open('loriot.txt', encoding='utf-8') as fh:
        loriot = list(tokenize(fh.read()))

    platforms = load_platforms()
    scores = defaultdict(dict)
    for party, sections in platforms.items():
        for section in sections:
            scores[party][section.key] = {'tokens': len(section)}
            text = normalize(section.text)

            # Count matching phrases/tokens, normalised by section length.
            n_decisive = sum(1.0 for phrase in decisive if phrase in text)
            scores[party][section.key]['decisive'] = n_decisive / len(section)

            n_loriot = sum(1.0 for token in loriot if token in text)
            scores[party][section.key]['loriot'] = n_loriot / len(section)

    with open('data/language.json', 'w') as fh:
        json.dump(dict(scores), fh, indent=2)
Example #3
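A smoke test that parses each section into part-of-speech-tagged sentences, apparently with pattern.de's parse/split, and prints each sentence's text.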
from pattern.de import parse, split  # assumed source of parse/split

from sections import load_platforms  # as in Example #6


def test():
    platforms = load_platforms()
    for party, sections in platforms.items():
        for section in sections:
            # split(parse(...)) yields tagged sentences.
            tagged = split(parse(section.text))
            for sentence in tagged:
                try:
                    print([sentence.text])
                except UnicodeEncodeError:
                    # Some consoles cannot encode every character; skip those.
                    pass
Example #4
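Splits each section into sentences and upserts them, together with a SHA1 hash and a validity flag, into a sentences database table; the upsert signature suggests the dataset library. Afterwards it prints the average sentence length per party.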
import hashlib
from collections import defaultdict

# `sentences` is assumed to be a database table (the upsert call suggests the
# dataset library); load_platforms, split_sentences and check_valid are module helpers.


def save_sentences():
    platforms = load_platforms()
    lengths = defaultdict(list)
    # Reset the validity flag on all previously stored sentences.
    sentences.update({'valid': False}, {})
    for party, sections in platforms.items():
        for section in sections:
            for i, sentence in enumerate(split_sentences(section)):
                lengths[party].append(len(sentence.split()))
                data = {
                    'num': i,
                    'hash': hashlib.sha1(sentence.encode('ascii', 'replace')).hexdigest(),
                    'text': sentence,
                    'party': party,
                    'section': section.key,
                    'valid': check_valid(sentence),
                }
                # Insert or update, keyed on sentence number and section.
                sentences.upsert(data, ['num', 'section'])

    for party, sens in lengths.items():
        avg = sum(sens) / len(sens)
        print('PARTY', party, 'AVG', avg)
Example #5
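Two fragments: a helper that loads term-to-label mappings from data/values.txt (the snippet's def line is missing), and a main block that counts wildcard matches of selected value terms per party. The snippet ends before the scores are reported.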
from collections import defaultdict
from pprint import pprint

from sections import load_platforms  # as in Example #6


def load_values():  # hypothetical name; the original def line is missing from the snippet
    # data/values.txt maps labels to terms, one "label: term1, term2, ..." per line.
    values = {}
    with open('data/values.txt', encoding='utf-8') as fh:
        for line in fh:
            if line.strip().startswith('#'):
                continue
            label, terms = line.split(':')
            terms = [t.strip() for t in terms.split(',')]
            for term in terms:
                values[term] = label
    pprint(values)
    return values


if __name__ == '__main__':
    # A trailing '*' matches token prefixes, a leading '*' token suffixes;
    # without a wildcard the token must match exactly.
    values = ['gerecht*', '*gerechtigkeit*', 'chancen*']
    platforms = load_platforms()
    lengths = {}
    scores = defaultdict(lambda: defaultdict(int))
    for party, sections in platforms.items():
        lengths[party] = sum(len(s) for s in sections)
        for section in sections:
            for token in section.tokens:
                for value in values:
                    vr = value.replace('*', '')
                    if value.endswith('*') and token.startswith(vr):
                        scores[party][value] += 1
                    elif value.startswith('*') and token.endswith(vr):
                        scores[party][value] += 1
                    elif vr == token:
                        scores[party][value] += 1
Example #6
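Computes, per party, the percentage of tokens each topic accounts for (skipping the intro section) and writes the shares to data/topic_shares.json.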
from sections import load_platforms
from collections import defaultdict
from pprint import pprint
import json

SKIP_TOPIC = "intro"


if __name__ == "__main__":
    result = {}
    data = load_platforms()
    for party, sections in data.items():
        # Total token count per party, ignoring the intro section.
        total_tokens = sum(len(s.tokens) for s in sections if s.topic != SKIP_TOPIC)
        topics = defaultdict(float)
        for section in sections:
            if section.topic == SKIP_TOPIC:
                continue
            # Share of the party's tokens devoted to this topic, in percent.
            topics[section.topic] += (len(section.tokens) / float(total_tokens)) * 100
        pprint(dict(topics))
        print("TOTAL", sum(topics.values()))
        result[party] = dict(topics)
    with open("data/topic_shares.json", "w") as fh:
        json.dump(result, fh)