Example #1
0
def test_codec():
    """Round-trip every test word through icepy_encode/icepy_decode.

    Words that raise ValueError in icepy_encode are reported and skipped;
    every word that does encode must decode back to the original string.
    """
    for args in LOOKUP_TEST:
        word = args[0]
        try:
            encoded = icepy_encode(word)
        except ValueError:
            # Parenthesized print works identically on Python 2 and 3
            # (the original Python 2-only print statement was a syntax
            # error under Python 3).
            print("Could not encode %s" % repr(word))
            continue
        decoded = icepy_decode(encoded)

        assert word == decoded
Example #2
0
File: morph.py  Project: sverrirab/IcePy
    def expand(self, wordform, category=None, tag=None, single_part=True,
                     allow_morphemes=True):
        """
        Splits the input wordform into segments and returns a list of
        AnalysisMatch objects. Each AnalysisMatch object contains a *parts*
        attribute, which is a list of matches for each of the segments. Refer to
        the AnalysisMatch object documentation for information about its
        properties.

        category and tag can be specified to filter in the same manner as in
        lookup().

        If you specify single_part=False the function will not return or attempt
        to lookup the entire word form.

        By default, non-standalone morphemes (category 'm') are included in
        expansions if found. You may specify allow_morphemes=False to disable
        these, to only get independent word segments.
        """
        self.expand_count += 1

        # Reject malformed input early; an empty analysis means "no matches".
        if not self._check_input(wordform,category,tag): return WordformAnalysis([])

        #special case for nouns, a compound word segment should not be a proper noun
        if tag: tag = tag.split('-')[0]

        # Hyphens are not significant for segmentation.
        wordform = wordform.replace('-','')

        # Maps start offset -> list of ExpansionPart candidates beginning there.
        instances   = defaultdict(list)

        word_len = len(wordform)
        # NOTE(review): 'done' is never set to True anywhere in this loop, so
        # the 'if done: break' below is dead code — confirm and remove.
        done = False
        self.timing.start('expand_loop')
        # Generate every candidate segment, longest first; minimum length is 2
        # (range stops at 2), so single-character segments are never produced.
        for sublen in range(word_len,1,-1):
            index = -1
            while index+sublen < word_len:
                index += 1
                word = wordform[index:index+sublen]

                # Skip the whole-word segment when single_part is disabled.
                if not single_part and sublen==word_len: continue
                # A segment ending one char before the end can never be part of
                # a full tiling: the 1-char remainder is below the minimum
                # segment length produced above.
                if index+sublen==word_len-1: continue

                #make sure last segment matches the specified filter
                if index+sublen==word_len:
                    lookup = self._lookup_candidates(word, category, tag)
                else:
                    lookup = self._lookup_candidates(word, None, None)

                for match in lookup:
                    # Morpheme tags start with 'm'; drop them when disallowed.
                    if not allow_morphemes and match.match_tag[0]=='m': continue
                    instances[index].append(ExpansionPart(index,word,match))

            if done: break
        self.timing.end('expand_loop')

        #for index,parts in instances.iteritems():
        #    print index, ''.join([str(p)+' ' for p in parts])
        if not instances: return WordformAnalysis([])

        """find all valid matches"""
        matches = []
        #loop through the "roots" (i.e. instances with index 0)
        # A valid expansion must start at offset 0; bail out if nothing does.
        if 0 not in instances: return WordformAnalysis([])
        self.timing.start('match_traverse')
        for root in instances[0]:
            # NOTE(review): every part in instances[0] was appended with
            # index 0, so this guard looks unreachable — confirm.
            if root.index > 0: break

            candidate = [root]
            #special case of entire word being found on first index
            if root.end == word_len:
                matches.append(ExpansionMatch(candidate))

            #recursively traverse the "tree" of parts to find all sets which add up
            #to the full length of the word
            _traverse_parts(instances, matches, root.end, word_len, candidate, [0])
        self.timing.end('match_traverse')

        if not matches: return WordformAnalysis([])

        #filter matches and set attributes for AnalysisMatch objects
        results = []
        for match in matches:
            if not len(match): continue
            # Whole-word matches are excluded when single_part is disabled.
            if not single_part and len(match.parts) < 2: continue

            last_tag = match[-1].analysis.match_tag
            # A lone morpheme is not a valid expansion.
            if len(match)==1 and last_tag[0]=='m': continue

            #a compound word should not end with a proper noun. at least that's
            #not a supported case for now.
            if last_tag[0]=='n' and len(last_tag)>2 and last_tag[-2]=='-': continue

            # Morpheme tags carry a 3-character prefix that is stripped for the
            # result — presumably the morpheme marker; verify tag encoding.
            result_tag = last_tag if last_tag[0]!='m' else last_tag[3:]

            # Rebuild the full surface form: all leading segments plus the
            # final segment's lemma/prefix.
            prefix = ''.join([m.analysis.word for m in match[:-1]])
            lemma  = prefix + match[-1].analysis.lemma
            prefix = prefix + match[-1].analysis.prefix
            suffix = icepy_encode(wordform[len(prefix):])

            result = AnalysisMatch(prefix, suffix, 0, lemma, match[-1].analysis.otb_count,
                               match[-1].analysis.suffix_id, result_tag,
                               'expanded_lookup', match.parts)

            # Frequency statistics used downstream for ranking.
            result.tag_count = self.tag_count.get(result_tag,0)
            result.tag_pattern_count = self.expansion_tag_patterns.get(result.tags(),0)

            results.append(result)

        return WordformAnalysis(results)
Example #3
0
File: morph.py  Project: sverrirab/IcePy
    def inflection_analysis(self, input):
        """ Generate a InflectionAnalysis object, given an AnalysisMatch object
        or a list/tuple of words.

        The InflectionAnalysis object contains a breakdown of how the given word
        is declined. It includes a stem, a list of stem-variables
        and corresponding lists of tags and inflectional suffixes.

        Raises ValueError if the input is neither an AnalysisMatch nor a
        list/tuple of (byte/unicode) strings.
        """
        # NOTE(review): 'input' shadows the builtin of the same name; it is
        # part of the public signature so it is left unchanged here.
        wordforms = []
        tags = []
        if isinstance(input, AnalysisMatch):
            # Expand the match's suffix table into one wordform per tag.
            for suffix,tagset in self.id_suffixes[input.suffix_id].iteritems():
                for tag in tagset:
                    wordforms.append(input.prefix+suffix)
                    tags.append(tag)
        elif isinstance(input, (list,tuple)):
            for word in input:
                if isinstance(word, str):
                    wordforms.append(word)
                elif isinstance(word, unicode):
                    # Unicode input is normalized to the internal encoding.
                    wordforms.append(icepy_encode(word))
                else:
                    raise ValueError('input list/tuple must only contain strings')
            # No tag information available for plain word lists.
            tags = ['' for w in wordforms]
        else:
            raise ValueError('input object must be AnalysisMatch instance or list/tuple of wordforms')

        # The shortest wordform becomes the base that all others are aligned
        # against character by character.
        wordforms.sort(key=lambda x: len(x))

        base = wordforms[0]
        pattern = r''

        # variables[i] collects, per wordform i, the variable (non-constant)
        # character runs found between constant characters.
        variables = []
        for i in range(len(wordforms)): variables.append([])

        # running_vars[i]: variable run currently being accumulated for
        # wordform i; suffixes[i]: text after the last constant seen so far.
        running_vars   = [None for i in range(len(wordforms))]
        suffixes       = [None for i in range(len(wordforms))]
        last_constant  = -1
        for x,char in enumerate(base):
            #determine if character at offset X is a constant (that is, appears
            #predictably in all wordforms)
            range_start = last_constant + 1
            range_end   = x + 1
            is_constant = True

            for w,word in enumerate(wordforms[1:]):
                index = word[range_start:range_end].find(char)
                #print word, char, range_end, range_start, index
                if index < 0:
                    is_constant = False

            if not is_constant:
                # Append this position's character to each other wordform's
                # running variable segment.
                for w,word in enumerate(wordforms[1:]):
                    if running_vars[w+1] is None:
                        running_vars[w+1] = word[x]
                    else:
                        running_vars[w+1] += word[x]
            else:
                # Record each wordform's remainder after the matched constant;
                # repeatedly overwritten so it ends up holding the text after
                # the LAST constant — the inflectional suffix.
                for w,word in enumerate(wordforms[1:]):
                    index = word[range_start:range_end].find(char)
                    suffixes[w+1] = word[range_start+index+1:]

            if is_constant:
                if x > range_start:
                    # Flush the accumulated variable runs into the pattern as
                    # a regex alternation, longest alternative first.
                    var_set = set()
                    for i,v in enumerate(running_vars):
                        variables[i].append(v)
                        if v is not None: var_set.add(v)
                    # NOTE(review): 'vars' shadows the builtin of the same name.
                    vars = r'|'.join( sorted(var_set,key=lambda x: len(x),reverse=True) )
                    pattern += r'(%s)' % vars
                pattern += char
                last_constant = x
                running_vars = [None for i in range(len(wordforms))]
            else:
                # Accumulate the base wordform's own variable run.
                if running_vars[0] is None:
                    running_vars[0] = char
                else:
                    running_vars[0] += char

        # The base's suffix is everything after its last constant character.
        suffixes[0] = base[last_constant+1:]

        return InflectionAnalysis(pattern, suffixes, variables, tags, wordforms)
Example #4
0
File: morph.py  Project: sverrirab/IcePy
    def _lookup_candidates(self, wordform, category=None, tag=None):
        """Return a list of AnalysisMatch candidates for *wordform*.

        The wordform (and optional tag) are lowercased and encoded to the
        internal representation; un-encodable input yields an empty list.
        When *tag* is given, *category* is derived from its first character
        and any caller-supplied category is overridden.
        """
        self.lookup_count += 1

        #encode input strings
        try:
            wordform = icepy_encode(wordform.lower())
            if tag:
                tag = icepy_encode(tag.lower())
                category = tag[0]
        except ValueError:
            #print "warning: could not encode word/tag %s/%s" % (repr(wordform),repr(tag))
            return []

        #first check if wordform is in the bloom filter. this is primarily used
        #to increase performance in expansions, as they frequently include a
        #large number of lookups of non-existent words. this may return a
        #false positive but then the prefix lookup will simply return an empty set.
        self.timing.start('bloom_lookup')
        in_bloom = True
        if self.bloom and not self.bloom.InFilter(wordform):
            in_bloom = False
            self.bloom_negatives += 1
        self.timing.end('bloom_lookup')

        if not in_bloom: return []

        #look for prefix in prefix map.
        self.timing.start('prefix_lookup')
        # Try the whole wordform first, then progressively shorter prefixes,
        # down to stem_minimum_length (loop exit condition at the bottom).
        prefix = wordform
        candidates = []
        while True:
            if prefix in self.prefix_map:
                #search for the word in the selected prefixes suffix map
                word_suffix = wordform[len(prefix):]

                self.timing.start('lookup_prefix_loop')
                for candidate_id,candidate_suffix_id,wordform_count in self.prefix_map[prefix]:
                    suffixmap = self.id_suffixes[candidate_suffix_id]

                    # Tag set for this exact suffix; no entry means this
                    # candidate does not produce the looked-up wordform.
                    tags = suffixmap.get(word_suffix, None)
                    if not tags: continue

                    lemma, lemma_category, otb_count = self.id_lemma[candidate_id]

                    if lemma_category=='m' and tag and tag[0]!='m':
                        # Morpheme tags carry a 3-char prefix before the real
                        # tag — compare against the stripped form, then
                        # synthesize a tag with the morpheme prefix restored.
                        if tag not in [t[3:] for t in tags]: continue
                        tags = [tags[0][:3] + tag]
                    elif tag:
                        if tag not in tags: continue
                        tags = [tag]
                    elif category and lemma_category not in (category,'m'):
                        # Category filter; morphemes ('m') always pass.
                        continue

                    # One AnalysisMatch per surviving tag.
                    candidates.extend([
                        AnalysisMatch(prefix, word_suffix, candidate_id, lemma,
                        otb_count, candidate_suffix_id, candidate_tag, 'lookup')
                        for candidate_tag in tags
                    ])
                self.timing.end('lookup_prefix_loop')

            if not prefix or len(prefix) < self.stem_minimum_length: break
            prefix = prefix[:-1]

        if not candidates:
            self.lookup_negatives += 1

        self.timing.end('prefix_lookup')
        return candidates
Example #5
0
File: morph.py  Project: sverrirab/IcePy
    def _process_raw(self):
        """Build the in-memory lookup tables from the raw corpora.

        Loads OTB frequency counts, the grouped BIN entries, non-standalone
        morphemes, and OTB adverbs, filling self.id_lemma, self.prefix_map,
        self.id_suffixes and self.tag_count, then generates the bloom filter.
        """
        # Interned suffix maps: frozen suffix->tags structure -> suffix_id.
        suffix_tmp = {}

        #load OTB
        otb = {}
        adverbs = []
        for word, tag, count in corpustools.read_otb():
            otb[word] = count
            #pluck out any adverbs
            if tag[0]=='a': adverbs.append((word,tag,count))

        #load BIN
        lemma_id = 0
        for entries in corpustools.read_bin_grouped(filter=True):
            count = 0
            category = CATEGORY_MAP[entries[0].flokkur]
            lemma = None

            coded_entries = []
            for entry in entries:
                # Aggregate OTB occurrence counts across all inflected forms.
                count   += otb.get(entry.ordmynd, 0)

                #encode/preprocess entries
                tag   = icepy_encode(
                            translate_tag(category,entry.flokkur,entry.hluti,entry.greining)
                        )
                #add proper noun marker to tag
                if tag[0]=='n' and entry.lemma[0].isupper() and '-' not in tag:
                    if tag[-1]=='g':
                        tag += 's'
                    else:
                        tag += '-s'

                # First entry of the group supplies the lemma.
                if not lemma: lemma = icepy_encode(entry.lemma.lower())
                word  = icepy_encode(entry.ordmynd.lower())

                self.tag_count[tag] += 1
                coded_entries.append((word,tag))

            lemma_id += 1

            self.id_lemma[lemma_id] = (lemma, category, count)
            self._prefix_fill(lemma_id,coded_entries,suffix_tmp)

        #inject morphemes
        for lemma,entries in corpustools.read_morphemes_grouped():
            count = 0 #currently no count info available for morphemes
            category = 'm'
            lemma = icepy_encode(lemma)
            # NOTE(review): the unpacking 'for word,tag in entries' below
            # implies each encoded entry is a (word, tag) pair — confirm what
            # read_morphemes_grouped yields and what icepy_encode returns here.
            entries = [icepy_encode(e) for e in entries]

            for word,tag in entries:
                self.tag_count[tag] += 1

            lemma_id += 1
            self.id_lemma[lemma_id] = (lemma, category, count)
            self._prefix_fill(lemma_id,entries,suffix_tmp)

        #inject adverb tags from OTB
        # This pass only pre-registers the single-suffix map for each adverb
        # tag in suffix_tmp, so the lookup in the injection pass below cannot
        # miss; the word itself is not stored yet.
        for word,tag,count in adverbs:
            tag = icepy_encode(tag)
            frozenmap = (('', (tag,)),)
            self.tag_count[tag] += 1
            if frozenmap in suffix_tmp:
                suffix_id = suffix_tmp[frozenmap]
            else:
                suffix_id = len(suffix_tmp)
                suffix_tmp[frozenmap] = suffix_id

        #reverse suffix and tag maps
        for suffixes,suffix_id in suffix_tmp.iteritems():
            self.id_suffixes[suffix_id] = dict(suffixes)

        #inject adverbs from OTB, if they are not already in the maps
        for word,tag,count in adverbs:
            if not self._lookup_candidates(word,tag=tag):
                word = icepy_encode(word)
                lemma_id += 1
                self.id_lemma[lemma_id] = (word, 'a', count)

                frozenmap = (('', (icepy_encode(tag),)),)
                suffix_id = suffix_tmp[frozenmap]

                self.prefix_map[word].append( (lemma_id, suffix_id, 1) )

        #generate bloom filter
        self._generate_bloom()