def tag_frequency():
    """
    Tally the tags that occur in the BÍN corpus.

    Every entry yielded by read_bin() has its tag translated to IceNLP
    format via translate_tag(), and is counted under the key
    (tag, flokkur, hluti, greining).

    Returns a defaultdict(int) mapping those 4-tuples to occurrence counts.
    """
    counts = defaultdict(int)
    for item in read_bin():
        word_class = item.flokkur
        icenlp_tag = translate_tag(
            CATEGORY_MAP[word_class],
            word_class,
            item.hluti,
            item.greining,
        )
        # the raw BÍN fields are kept in the key next to the translated tag
        key = (icenlp_tag, word_class, item.hluti, item.greining)
        counts[key] += 1
    return counts
def tag_frequency():
    """
    Count how often each translated tag appears in the BÍN corpus.

    Walks read_bin(), translates each entry's tag to IceNLP format with
    translate_tag(), and increments the counter stored under
    (tag, flokkur, hluti, greining).

    Returns a defaultdict(int) from those 4-tuples to their counts.
    """
    frequencies = defaultdict(int)
    for record in read_bin():
        category = CATEGORY_MAP[record.flokkur]
        translated = translate_tag(
            category, record.flokkur, record.hluti, record.greining
        )
        # key on the translated tag plus the raw BÍN fields it came from
        frequencies[
            (translated, record.flokkur, record.hluti, record.greining)
        ] += 1
    return frequencies
def _process_raw(self):
    """
    Build the in-memory lookup tables from the raw corpora.

    Steps, in order:

    1. Read OTB to get a per-word count and to collect its adverb entries
       (tags whose first character is 'a').
    2. Read the grouped BIN entries. Each group becomes one lemma in
       self.id_lemma, and its encoded (word, tag) forms are handed to
       self._prefix_fill().
    3. Do the same for the grouped morphemes (category 'm', count 0).
    4. Make sure every OTB adverb tag has a single-tag suffix map.
    5. Invert the temporary suffix map into self.id_suffixes.
    6. Add the OTB adverbs that self._lookup_candidates() does not find.
    7. Call self._generate_bloom().

    Mutates self.tag_count, self.id_lemma, self.id_suffixes and
    self.prefix_map. Returns nothing.
    """
    # frozen suffix map -> suffix id. Passed to _prefix_fill() (which
    # presumably registers new suffix maps in it -- confirm there) and
    # inverted into self.id_suffixes further down.
    suffix_tmp = {}

    # load OTB
    otb = {}
    adverbs = []
    for word, tag, count in corpustools.read_otb():
        # NOTE(review): if read_otb() yields the same word more than once,
        # only the last count is kept -- confirm this is intended.
        otb[word] = count
        # pluck out any adverbs
        if tag[0] == 'a':
            adverbs.append((word, tag, count))

    # load BIN
    lemma_id = 0
    for entries in corpustools.read_bin_grouped(filter=True):
        count = 0
        category = CATEGORY_MAP[entries[0].flokkur]
        lemma = None
        coded_entries = []
        for entry in entries:
            # the lemma's count is the sum of the OTB counts of its forms
            count += otb.get(entry.ordmynd, 0)
            # encode/preprocess entries
            tag = icepy_encode(
                translate_tag(category, entry.flokkur, entry.hluti, entry.greining)
            )
            # add proper noun marker to tag
            if tag[0] == 'n' and entry.lemma[0].isupper() and '-' not in tag:
                tag += 's' if tag[-1] == 'g' else '-s'
            # the first entry of the group supplies the lemma
            if not lemma:
                lemma = icepy_encode(entry.lemma.lower())
            word = icepy_encode(entry.ordmynd.lower())
            self.tag_count[tag] += 1
            coded_entries.append((word, tag))
        lemma_id += 1
        self.id_lemma[lemma_id] = (lemma, category, count)
        self._prefix_fill(lemma_id, coded_entries, suffix_tmp)

    # inject morphemes
    for lemma, entries in corpustools.read_morphemes_grouped():
        lemma = icepy_encode(lemma)
        entries = [icepy_encode(e) for e in entries]
        for _, tag in entries:
            self.tag_count[tag] += 1
        lemma_id += 1
        # category 'm'; currently no count info available for morphemes
        self.id_lemma[lemma_id] = (lemma, 'm', 0)
        self._prefix_fill(lemma_id, entries, suffix_tmp)

    # inject adverb tags from OTB: each tag gets a suffix map consisting of
    # the empty suffix carrying just that tag
    for _, tag, _ in adverbs:
        tag = icepy_encode(tag)
        self.tag_count[tag] += 1
        # reuse the existing id, or assign the next free one
        suffix_tmp.setdefault((('', (tag,)),), len(suffix_tmp))

    # reverse the suffix map: id -> {suffix: tags}
    # (.items() instead of .iteritems() so this also runs on Python 3)
    for suffixes, suffix_id in suffix_tmp.items():
        self.id_suffixes[suffix_id] = dict(suffixes)

    # inject adverbs from OTB, if they are not already in the maps
    for word, tag, count in adverbs:
        # NOTE(review): the lookup receives the un-encoded word and tag,
        # while the maps are filled with encoded values -- presumably
        # _lookup_candidates() encodes internally; confirm.
        if self._lookup_candidates(word, tag=tag):
            continue
        word = icepy_encode(word)
        lemma_id += 1
        self.id_lemma[lemma_id] = (word, 'a', count)
        # registered in the adverb-tag pass above, so the key exists
        suffix_id = suffix_tmp[(('', (icepy_encode(tag),)),)]
        self.prefix_map[word].append((lemma_id, suffix_id, 1))

    # generate bloom filter
    self._generate_bloom()