Example #1
File: extractor.py Project: fseasy/cws
    def build_lexicon_match_state(self, instance):
        '''
        for every unigram , record :
        as a word head , the max length of any matching lexicon word
        as a word middle , the max length of any matching lexicon word
        as a word end , the max length of any matching lexicon word
        '''
        instance_len = len(instance)
        match_state = [[1] * 3
                       for i in range(instance_len)]  #! minimum length is 1
        for i in range(instance_len):
            j = min(i + LEXICON_MATCH_MAX_LENGTH, instance_len - 1)
            while j > i:
                test_word = WSAtomTranslator.trans_atom_gram_list2unicode_line(
                    instance[i:j + 1])
                if test_word in self.lexicon:
                    word_len = j - i + 1
                    #! max length as the word head
                    match_state[i][0] = max(match_state[i][0], word_len)
                    #! max length as the word middle
                    for interval in range(i + 1, j):
                        match_state[interval][1] = max(
                            match_state[interval][1], word_len)
                    #! max length as the word end
                    match_state[j][2] = max(match_state[j][2], word_len)
                    break
                j -= 1

        return match_state
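To make the match-state layout concrete, here is a minimal, self-contained sketch of the same bookkeeping on a plain character string; the lexicon, the max match length and the input below are made up for illustration, and the real code above works on WSAtom gram lists through WSAtomTranslator.

# Hedged sketch: same head/middle/end max-length bookkeeping as above,
# but on a plain string; the lexicon and input are made-up toy data.
LEXICON_MATCH_MAX_LENGTH = 5

def build_match_state(chars, lexicon):
    n = len(chars)
    match_state = [[1] * 3 for _ in range(n)]  # [as head, as middle, as end], minimum length 1
    for i in range(n):
        j = min(i + LEXICON_MATCH_MAX_LENGTH, n - 1)
        while j > i:
            if chars[i:j + 1] in lexicon:
                word_len = j - i + 1
                match_state[i][0] = max(match_state[i][0], word_len)          # word head
                for k in range(i + 1, j):
                    match_state[k][1] = max(match_state[k][1], word_len)      # word middle
                match_state[j][2] = max(match_state[j][2], word_len)          # word end
                break  # only the longest match starting at i is recorded
            j -= 1
    return match_state

print(build_match_state("abcd", {"abc", "cd"}))
# -> [[3, 1, 1], [1, 3, 1], [2, 1, 3], [1, 1, 2]]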
Example #2
File: segmentor.py Project: fseasy/cws
 def _processing_raw_training_data2unigrams_and_tags(self):
     '''
     from line data (WSAtom wrapped) to training data (as needed by word segmentation)
     [ inner class function ]
     logic :
         process self.raw_training_data and set self.training_unigrams_data , self.training_tags_data
         unigram_line_list : list of lists ; the innermost element is a unigram .
                             => [ [WSAtom(unigram) , WSAtom(unigram) , ...] , ...  ]
         tags_list : list of lists ; the innermost element is a tag . => [ [tag_b , tag_m , ...] , ...]
 
     '''
     logging.info("processing raw training data to unigrams and tags .")
     if self.raw_training_data is None:
         logging.error("failed!")
         return
     self.training_unigrams_data = []
     self.training_tags_data = []
     for sentence in self.raw_training_data:
         unigram_line, tags = Segmentor._processing_one_segmented_WSAtom_instance2unigrams_and_tags(
             sentence)
         self.training_tags_data.append(tags)
         self.training_unigrams_data.append(unigram_line)
     if DEBUG:
         logger.debug("the 1st line : %s" % (u" ".join(
             [unicode(atom)
              for atom in self.training_unigrams_data[0]]).encode('utf8')))
         logger.debug("the 1st tag list : " + " ".join(
             [TAG_NAME_TRANS[tag] for tag in self.training_tags_data[0]]))
         logger.debug("the 1st origin seg line : " + " ".join([
             WSAtomTranslator.trans_atom_gram_list2unicode_line(
                 atom_list).encode("utf8")
             for atom_list in self.raw_training_data[0]
         ]))
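The per-sentence helper _processing_one_segmented_WSAtom_instance2unigrams_and_tags is not shown in these examples; as a rough, hedged sketch of what such a conversion does, assuming a standard B/M/E/S tag scheme and plain strings in place of WSAtom lists (the literal tag names and types here are assumptions, not the project's actual constants):

# Hedged sketch: flatten one segmented sentence into unigrams plus B/M/E/S tags.
# Plain strings and the literal tag names are assumptions for illustration only.
def segmented_sentence_to_unigrams_and_tags(words):
    unigrams, tags = [], []
    for word in words:
        unigrams.extend(word)                     # one unigram per character
        if len(word) == 1:
            tags.append("S")                      # single-character word
        else:
            tags.extend(["B"] + ["M"] * (len(word) - 2) + ["E"])  # begin / middle / end
    return unigrams, tags

print(segmented_sentence_to_unigrams_and_tags([u"我们", u"喜欢", u"它"]))
# -> (['我', '们', '喜', '欢', '它'], ['B', 'E', 'B', 'E', 'S'])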
Example #3
File: segmentor.py Project: memeda/cws
 def _processing_raw_training_data2unigrams_and_tags(self) :
     '''
     from line data (WSAtom wrapped) to training data (as needed by word segmentation)
     [ inner class function ]
     logic :
         process self.raw_training_data and set self.training_unigrams_data , self.training_tags_data
         unigram_line_list : list of lists ; the innermost element is a unigram .
                             => [ [WSAtom(unigram) , WSAtom(unigram) , ...] , ...  ]
         tags_list : list of lists ; the innermost element is a tag . => [ [tag_b , tag_m , ...] , ...]
 
     '''
     logging.info("processing raw training data to unigrams and tags .")
     if self.raw_training_data is None : 
         logging.error("failed!")
         return
     self.training_unigrams_data = []
     self.training_tags_data = []
     for sentence in self.raw_training_data :
         unigram_line , tags = Segmentor._processing_one_segmented_WSAtom_instance2unigrams_and_tags(sentence)
         self.training_tags_data.append(tags)
         self.training_unigrams_data.append(unigram_line)
     if DEBUG :
         logger.debug("the 1st line : %s" %( u" ".join(
                      [ unicode(atom) for atom in self.training_unigrams_data[0]] ).encode('utf8') ))
         logger.debug("the 1st tag list : " + " ".join([ TAG_NAME_TRANS[tag] for tag in self.training_tags_data[0] ]))
         logger.debug("the 1st origin seg line : " + " ".join(
                      [WSAtomTranslator.trans_atom_gram_list2unicode_line(atom_list).encode("utf8") 
                      for atom_list in self.raw_training_data[0]]))
Example #4
File: extractor.py Project: memeda/cws
    def build_lexicon_match_state(self , instance) :
        '''
        for every unigram , record :
        as a word head , the max length of any matching lexicon word
        as a word middle , the max length of any matching lexicon word
        as a word end , the max length of any matching lexicon word
        '''
        instance_len = len(instance)
        match_state = [ [ 1 ] * 3 for i in range(instance_len) ] #! minimum length is 1
        for i in range(instance_len) :
            j = min( i + LEXICON_MATCH_MAX_LENGTH , instance_len - 1 )
            while j > i :
                test_word = WSAtomTranslator.trans_atom_gram_list2unicode_line(instance[i:j+1])
                if test_word in self.lexicon :
                    word_len = j - i + 1
                    #! max length as the word head 
                    match_state[i][0] = max( match_state[i][0] , word_len )
                    #! max length as the word middle
                    for interval in range(i+1 , j) :
                        match_state[interval][1] = max( match_state[interval][1] , word_len )
                    #! max length as the word end
                    match_state[j][2] = max( match_state[j][2] , word_len )
                    break
                j -= 1

        return match_state
Example #5
File: segmentor.py Project: memeda/cws
 def _build_inner_lexicon(self , threshold=1.) :
     logging.info("build inner lexicon from training data .")
     if self.raw_training_data is None :
         logging.error('failed')
         return
     words_counter = Counter()
     for raw_instance in self.raw_training_data :
         #! len > 1 to ensure it is a lexicon
         unicode_instance = [ WSAtomTranslator.trans_atom_gram_list2unicode_line(atom_instance_gram_list) 
                              for atom_instance_gram_list in raw_instance if len(atom_instance_gram_list) > 1 ]
         words_counter.update(unicode_instance)
     total_freq = sum(words_counter.viewvalues())
     lexicon_list = []
     if threshold < 1. :
          ##! a fast and clear implementation would use Counter.most_common(N) to return the top words up to the threshold .
          ##! but that clearly causes some words to be added to the lexicon dict while other words with the same freq are cut off at the tail . it is bad .
          ##! so do the following logic to keep it fair .
          ##! strategy changed ! the threshold freq is also accepted (originally , we rejected words at the edge frequency) !
         threshold_num = int( total_freq * threshold )
         pre_freq = INF
         words_has_same_freq = []
         freq_counter = 0
         for word , freq in words_counter.most_common() :
             if freq != pre_freq :
                 lexicon_list.extend(words_has_same_freq)
                 words_has_same_freq = []
                 pre_freq = freq
                 if freq_counter > threshold_num :
                     break
             words_has_same_freq.append(word)
             freq_counter += freq
         else :
              lexicon_list.extend(words_has_same_freq) #! if the loop ended because all words were iterated , we should append the last tier !
     else :
         lexicon_list = words_counter.keys()
     logging.info( "inner lexicon info : %d/%d" %( len(lexicon_list) , len(words_counter) )  )
     
     if DEBUG :
         freq_in_lexicon = 0
         min_freq = INF
         for word in lexicon_list :
             word_freq = words_counter[word]
             freq_in_lexicon += word_freq
             if word_freq < min_freq :
                 min_freq = word_freq
         logger.debug("origin words count : " + str(len(words_counter)))
         logger.debug("lexicon count : " + str(len(lexicon_list)))
         logger.debug( ("thredhold num is %d , actually total freqency in lexicon is %d(total frequency of all words : %d ),"
                        "minimun frequency in lexicon is %s , frequency ratio is %.2f%% , word count ratio is %.2f%%" %( 
                         threshold_num , freq_in_lexicon , total_freq , min_freq , 
                         freq_in_lexicon / float(total_freq) , len(lexicon_list) / float(len(words_counter)) )) 
                     )
     self.inner_lexicon =  dict.fromkeys(lexicon_list) #! to make it more efficient 
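The threshold branch above keeps whole frequency ties instead of cutting a tie in half; here is a self-contained sketch of just that selection step, with made-up counts and plain strings in place of the project's unicode lines:

# Hedged sketch of the fairness rule: walk words in descending frequency order,
# and once the accumulated frequency passes the threshold, still keep every word
# tied at the current frequency before stopping. Toy data for illustration only.
from collections import Counter

def select_lexicon(words_counter, threshold=0.9):
    total_freq = sum(words_counter.values())
    threshold_num = int(total_freq * threshold)
    lexicon, same_freq_words = [], []
    pre_freq, freq_counter = float("inf"), 0
    for word, freq in words_counter.most_common():
        if freq != pre_freq:
            lexicon.extend(same_freq_words)   # flush the previous frequency tier
            same_freq_words = []
            pre_freq = freq
            if freq_counter > threshold_num:
                break
        same_freq_words.append(word)
        freq_counter += freq
    else:
        lexicon.extend(same_freq_words)       # loop exhausted: keep the last tier too
    return lexicon

counts = Counter({u"中国": 5, u"我们": 5, u"今天": 2, u"天气": 1})
print(select_lexicon(counts, threshold=0.7))  # -> both 5-frequency words, nothing below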
Example #6
class DatasetHandler(object):
    @staticmethod
    def is_readable(path):
        return (os.access(path, os.F_OK) and os.access(path, os.R_OK))

    @staticmethod
    def is_writeable(path):
        if os.access(path, os.F_OK):
            return os.access(path, os.W_OK)
        #! path does not exist , so check whether the directory path is writeable
        dir_path = os.path.dirname(
            os.path.abspath(path))  #!! os.path.abspath is needed !
        #~  otherwise dirname returns an empty str for a relative path
        return (os.access(dir_path, os.F_OK) and os.access(dir_path, os.W_OK))

    @staticmethod
    def get_file_encoding(f):
        '''
        get the file's encoding ; a simple and naive implementation
        Args :
            f : file 
        Returns :
            encoding : str ;
        Attention :
            if decoding fails with all candidates , the process will exit !
        '''
        cur_g = f.tell()
        line = f.readline()
        f.seek(cur_g)
        encoding_list = []
        if f.encoding is not None:
            encoding_list.append(f.encoding)
        encoding_list.extend(["utf8", "gb18030"])
        uline = ""
        for encoding in encoding_list:
            try:
                uline = line.decode(encoding)
            except:
                uline = ""
                continue
            return encoding
        logging.error("failed to decode the training data . file path : '%s'" %
                      (f.name))
        print >> sys.stderr, "Exit"
        exit(1)

    @staticmethod
    def read_training_data(tf):
        '''
        read lines from training dataset
        Args: 
            tf : file object of training data
    
        Returns :
            data_lines : lines of the dataset ; each line is a list , and every element of it is also a list !
                        the innermost element is a WSAtom .
                        => [ [ [ WSAtom("like") , WSAtom("我") , ... ] , [WSAtom("一") , WSAtom("样")] ,...  ] ]
                        what is this ? -> the innermost list is just the character grams of one word , so every word is represented by
                                          a list of WSAtom . the middle list is the sentence , and the outermost list is the list of sentences .
                        Why use WSAtom ? -> because we want an English word to be a `single representation` instead of a `list of letters` !
        '''
        if type(tf) != file:
            try:
                tf = open(tf)
            except IOError, e:
                traceback.print_exc()
                exit(1)
        logging.info("reading training data from '%s'" % (tf.name))
        data_lines = []
        encoding = DatasetHandler.get_file_encoding(tf)
        WSAtom.set_encoding(encoding)
        for line in tf:
            line = line.strip()
            if len(line) == 0:
                continue
            uline = ""
            try:
                uline = line.decode(encoding)
            except:
                logging.warning("decoding dataset error : %s " % (line))
                continue
            uline_parts = uline.split()
            atom_list = []
            for uline_part in uline_parts:
                atom_list.append(
                    WSAtomTranslator.trans_unicode_list2atom_gram_list(
                        uline_part))
            data_lines.append(atom_list)
        logging.info("%d lines read done ." % (len(data_lines)))
        tf.close()
        return data_lines
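get_file_encoding above simply peeks one line and tries a few candidate codecs in order; the same trial-decoding idea as a standalone sketch on raw bytes (the candidate list is an assumption, mirroring the "utf8" / "gb18030" fallbacks used above):

# Hedged sketch: probe a byte string against candidate encodings in order and
# return the first codec that decodes it cleanly, or None if all of them fail.
def guess_encoding(raw_bytes, candidates=("utf-8", "gb18030")):
    for encoding in candidates:
        try:
            raw_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue
        return encoding
    return None

print(guess_encoding(u"今天 天气 不错".encode("gb18030")))  # GBK-style bytes are not valid UTF-8 -> 'gb18030'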
Example #7
        WSAtom.set_encoding(encoding)
        for line in df:
            line = line.strip()
            #if len(line) == 0 : #! still handle it !
            #    continue
            try:
                uline = line.decode(encoding)
            except UnicodeDecodeError, e:
                logging.warning("decoding dataset error : %s " % (line))
                #continue
                uline = ""  #! still handle it !
            uline_parts = uline.split()
            atom_list = []
            for uline_part in uline_parts:
                atom_list.append(
                    WSAtomTranslator.trans_unicode_list2atom_gram_list(
                        uline_part))
            yield atom_list
        df.close()

    @staticmethod
    def read_predict_data(df):
        '''
        A generator that iterates over the predict data
        Args :
            df : file , or a path str
        Returns :
            atom_list : [ WSAtom , WSAtom , ... ] 
            separator_position : list , the positions where separators exist
        '''
        if not isinstance(df, file):
            try:
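The fragment above ends before the body of read_predict_data; its line-reading pattern (decode each line, fall back to an empty line on errors, yield the split tokens) can be sketched in a self-contained way as follows, with the file handling simplified and every name made up for illustration:

# Hedged sketch of the same line-by-line generator pattern on a plain text file.
import io

def read_lines_as_tokens(path, encoding="utf-8"):
    with io.open(path, "rb") as f:
        for raw in f:
            try:
                line = raw.decode(encoding).strip()
            except UnicodeDecodeError:
                line = u""          # still yield an (empty) result, mirroring the original
            yield line.split()

# usage: for tokens in read_lines_as_tokens("predict.txt"): ...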
Example #8
File: segmentor.py Project: fseasy/cws
    def _build_inner_lexicon(self, threshold=1.):
        logging.info("build inner lexicon from training data .")
        if self.raw_training_data is None:
            logging.error('failed')
            return
        words_counter = Counter()
        for raw_instance in self.raw_training_data:
            #! len > 1 to ensure it is a lexicon
            unicode_instance = [
                WSAtomTranslator.trans_atom_gram_list2unicode_line(
                    atom_instance_gram_list)
                for atom_instance_gram_list in raw_instance
                if len(atom_instance_gram_list) > 1
            ]
            words_counter.update(unicode_instance)
        total_freq = sum(words_counter.viewvalues())
        lexicon_list = []
        if threshold < 1.:
            ##! a fast and clear implementation would use Counter.most_common(N) to return the top words up to the threshold .
            ##! but that clearly causes some words to be added to the lexicon dict while other words with the same freq are cut off at the tail . it is bad .
            ##! so do the following logic to keep it fair .
            ##! strategy changed ! the threshold freq is also accepted (originally , we rejected words at the edge frequency) !
            threshold_num = int(total_freq * threshold)
            pre_freq = INF
            words_has_same_freq = []
            freq_counter = 0
            for word, freq in words_counter.most_common():
                if freq != pre_freq:
                    lexicon_list.extend(words_has_same_freq)
                    words_has_same_freq = []
                    pre_freq = freq
                    if freq_counter > threshold_num:
                        break
                words_has_same_freq.append(word)
                freq_counter += freq
            else:
                lexicon_list.extend(
                    words_has_same_freq
                )  #! if the loop ended because all words were iterated , we should append the last tier !
        else:
            lexicon_list = words_counter.keys()
        logging.info("inner lexicon info : %d/%d" %
                     (len(lexicon_list), len(words_counter)))

        if DEBUG:
            freq_in_lexicon = 0
            min_freq = INF
            for word in lexicon_list:
                word_freq = words_counter[word]
                freq_in_lexicon += word_freq
                if word_freq < min_freq:
                    min_freq = word_freq
            logger.debug("origin words count : " + str(len(words_counter)))
            logger.debug("lexicon count : " + str(len(lexicon_list)))
            logger.debug((
                "thredhold num is %d , actually total freqency in lexicon is %d(total frequency of all words : %d ),"
                "minimun frequency in lexicon is %s , frequency ratio is %.2f%% , word count ratio is %.2f%%"
                % (threshold_num, freq_in_lexicon, total_freq,
                   min_freq, freq_in_lexicon / float(total_freq),
                   len(lexicon_list) / float(len(words_counter)))))
        self.inner_lexicon = dict.fromkeys(
            lexicon_list)  #! to make it more efficient
Example #9
        encoding = DatasetHandler.get_file_encoding(df)
        WSAtom.set_encoding(encoding)
        for line in df :
            line = line.strip()
            #if len(line) == 0 : #! still handle it !
            #    continue
            try :
                uline = line.decode(encoding)
            except UnicodeDecodeError , e :
                logging.warning("decoding dataset error : %s " %(line))
                #continue
                uline = "" #! still handle it !
            uline_parts = uline.split()
            atom_list = []
            for uline_part in uline_parts :
                atom_list.append(WSAtomTranslator.trans_unicode_list2atom_gram_list(uline_part))
            yield atom_list
        df.close()

    @staticmethod
    def read_predict_data(df) :
        '''
        A generator that iterates over the predict data
        Args :
            df : file , or a path str
        Returns :
            atom_list : [ WSAtom , WSAtom , ... ] 
            separator_position : list , the positions where separators exist
        '''
        if not isinstance(df , file) :
            try :