Пример #1
0
def tag_frequency():
    """Tally how often each translated tag combination occurs in BÍN.

    Every entry yielded by ``read_bin()`` is converted to an IceNLP-format
    tag with ``translate_tag``.  Occurrences are counted per
    ``(tag, flokkur, hluti, greining)`` tuple, i.e. the translated tag is
    kept together with the raw entry fields it was derived from.

    Returns a ``defaultdict(int)`` mapping each such tuple to its count.
    """
    counts = defaultdict(int)
    for record in read_bin():
        word_class = record.flokkur
        mapped_class = CATEGORY_MAP[word_class]
        part, analysis = record.hluti, record.greining
        icenlp_tag = translate_tag(mapped_class, word_class, part, analysis)
        counts[(icenlp_tag, word_class, part, analysis)] += 1
    return counts
Пример #2
0
def tag_frequency():
    """Count the translated tags found in the BÍN corpus.

    For each entry from ``read_bin()`` the IceNLP-format tag is computed with
    ``translate_tag``.  The result is keyed on the tuple
    ``(tag, flokkur, hluti, greining)`` rather than on the tag alone, so
    identical tags coming from different raw fields are counted separately.

    Returns a ``defaultdict(int)`` of key tuple => number of occurrences.
    """
    frequency = defaultdict(int)
    for item in read_bin():
        category = CATEGORY_MAP[item.flokkur]
        raw_fields = (item.flokkur, item.hluti, item.greining)
        translated = translate_tag(category, *raw_fields)
        frequency[(translated,) + raw_fields] += 1
    return frequency
Пример #3
0
    def _process_raw(self):
        """Build the lemma, suffix and prefix tables from the raw corpora.

        Steps, in order:

        1. Read the OTB word counts and set aside the OTB adverbs.
        2. Add every BIN lemma group, with tags translated and encoded.
        3. Add the morpheme groups.
        4. Register a bare suffix map for each OTB adverb tag.
        5. Invert the temporary suffix table into ``self.id_suffixes``.
        6. Add the OTB adverbs that ``self._lookup_candidates`` does not find.
        7. Generate the bloom filter.

        Writes to ``self.tag_count``, ``self.id_lemma``, ``self.id_suffixes``
        and ``self.prefix_map``; returns nothing.
        """
        # Frozen suffix map -> suffix id.  Handed to _prefix_fill for every
        # lemma and inverted into self.id_suffixes further down.
        suffix_tmp = {}

        # load OTB
        # word -> count; a later row for the same word replaces the earlier
        # count.
        # NOTE(review): if OTB lists a word once per tag, only the last count
        # survives here -- confirm that overwriting (not summing) is intended.
        otb = {}
        adverbs = []
        for word, tag, count in corpustools.read_otb():
            otb[word] = count
            # pluck out any adverbs (tags starting with 'a')
            if tag[0] == 'a':
                adverbs.append((word, tag, count))

        # load BIN
        lemma_id = 0
        for entries in corpustools.read_bin_grouped(filter=True):
            count = 0
            # The category is taken from the first entry of the group only.
            category = CATEGORY_MAP[entries[0].flokkur]
            lemma = None

            coded_entries = []
            for entry in entries:
                # The lemma's count is the sum of the OTB counts of its
                # word forms (0 for forms OTB does not contain).
                count += otb.get(entry.ordmynd, 0)

                # encode/preprocess entries
                tag = icepy_encode(
                    translate_tag(category, entry.flokkur, entry.hluti,
                                  entry.greining)
                )
                # add proper noun marker to tag: applies to 'n' tags without
                # a '-' whose lemma starts with an upper-case letter.  A tag
                # ending in 'g' gets 's', any other gets '-s'.
                if (tag[0] == 'n' and entry.lemma[0].isupper()
                        and '-' not in tag):
                    if tag[-1] == 'g':
                        tag += 's'
                    else:
                        tag += '-s'

                # The lemma string is taken from the first entry that yields
                # a non-empty encoded value.
                if not lemma:
                    lemma = icepy_encode(entry.lemma.lower())
                word = icepy_encode(entry.ordmynd.lower())

                self.tag_count[tag] += 1
                coded_entries.append((word, tag))

            # Lemma ids start at 1.
            lemma_id += 1

            self.id_lemma[lemma_id] = (lemma, category, count)
            self._prefix_fill(lemma_id, coded_entries, suffix_tmp)

        # inject morphemes
        for lemma, entries in corpustools.read_morphemes_grouped():
            count = 0  # currently no count info available for morphemes
            category = 'm'
            lemma = icepy_encode(lemma)
            entries = [icepy_encode(e) for e in entries]

            for _word, tag in entries:
                self.tag_count[tag] += 1

            lemma_id += 1
            self.id_lemma[lemma_id] = (lemma, category, count)
            self._prefix_fill(lemma_id, entries, suffix_tmp)

        # inject adverb tags from OTB
        # This has to happen before suffix_tmp is inverted below so that the
        # adverb suffix maps end up in self.id_suffixes as well.
        for _word, tag, _count in adverbs:
            tag = icepy_encode(tag)
            # Suffix map with a single empty suffix carrying just this tag.
            frozenmap = (('', (tag,)),)
            self.tag_count[tag] += 1
            # Register the map under the next free id unless already known;
            # len() is evaluated before the insertion, so a new id equals the
            # table size prior to adding the entry.
            suffix_tmp.setdefault(frozenmap, len(suffix_tmp))

        # reverse suffix and tag maps
        # items() instead of the Python-2-only iteritems(): yields the same
        # pairs and works on both Python 2 and Python 3.
        for suffixes, suffix_id in suffix_tmp.items():
            self.id_suffixes[suffix_id] = dict(suffixes)

        # inject adverbs from OTB, if they are not already in the maps
        for word, tag, count in adverbs:
            # NOTE(review): word and tag are passed un-encoded here while the
            # tables above hold encoded values -- presumably
            # _lookup_candidates encodes internally; confirm.
            if not self._lookup_candidates(word, tag=tag):
                word = icepy_encode(word)
                lemma_id += 1
                self.id_lemma[lemma_id] = (word, 'a', count)

                # Same key as registered in the adverb-tag pass above, so the
                # lookup cannot miss.
                frozenmap = (('', (icepy_encode(tag),)),)
                suffix_id = suffix_tmp[frozenmap]

                # The third tuple field is written as a constant 1; its
                # meaning is defined by the prefix_map readers (not visible
                # in this block).
                self.prefix_map[word].append((lemma_id, suffix_id, 1))

        # generate bloom filter
        self._generate_bloom()