예제 #1
0
파일: grammar.py 프로젝트: xhydras/scan
 def __init__(self, load=True):
     """Build or load the table of known-good POS tag sequences.

     :param load: when True, load pre-computed sequence counts from the
         pickle at ``self.data_path``; when False, rebuild them from
         ``self.data`` (presumably supplied by the base class -- TODO
         confirm) and persist them to disk.
     """
     super(GrammarCorrector, self).__init__()
     self.data_path = os.path.abspath(os.path.join(settings.DATA_PATH, PICKLE_FILENAME))
     self.model = PerceptronTagger()
     if not load:
         tags, self.good_sequences = self.get_sequences(self.data)
         # Context manager ensures the file handle is closed even on error.
         with open(self.data_path, 'wb') as f:
             pickle.dump(self.good_sequences, f)
     else:
         # BUG FIX: pickle data is binary -- the file must be opened 'rb'
         # (the original 'r' text mode fails on Python 3 / non-ASCII bytes).
         with open(self.data_path, 'rb') as f:
             self.good_sequences = pickle.load(f)
예제 #2
0
파일: grammar.py 프로젝트: fototo/scan
 def __init__(self, load=True):
     """Build or load the table of known-good POS tag sequences.

     :param load: when True, load pre-computed sequence counts from the
         pickle at ``self.data_path``; when False, rebuild them from
         ``self.data`` (presumably supplied by the base class -- TODO
         confirm) and persist them to disk.
     """
     super(GrammarCorrector, self).__init__()
     self.data_path = os.path.abspath(os.path.join(settings.DATA_PATH, PICKLE_FILENAME))
     self.model = PerceptronTagger()
     if not load:
         tags, self.good_sequences = self.get_sequences(self.data)
         # Context manager ensures the file handle is closed even on error.
         with open(self.data_path, "wb") as f:
             pickle.dump(self.good_sequences, f)
     else:
         # BUG FIX: pickle data is binary -- the file must be opened "rb"
         # (the original "r" text mode fails on Python 3 / non-ASCII bytes).
         with open(self.data_path, "rb") as f:
             self.good_sequences = pickle.load(f)
예제 #3
0
파일: grammar.py 프로젝트: xhydras/scan
class GrammarCorrector(BaseCorrector):
    """
    Flags likely-ungrammatical text by checking POS-tag n-grams against
    counts gathered from a known-good corpus.

    Adapted from http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    """

    # n-gram window sizes (in POS tags) to examine.
    lengths = [4]
    # A sequence seen fewer than this many times in the reference corpus
    # is treated as "bad" (likely ungrammatical).
    min_count = 3

    def __init__(self, load=True):
        """Build or load the table of known-good POS tag sequences.

        :param load: when True, load pre-computed sequence counts from the
            pickle at ``self.data_path``; when False, rebuild them from
            ``self.data`` (presumably supplied by the base class -- TODO
            confirm) and persist them to disk.
        """
        super(GrammarCorrector, self).__init__()
        self.data_path = os.path.abspath(os.path.join(settings.DATA_PATH, PICKLE_FILENAME))
        self.model = PerceptronTagger()
        if not load:
            tags, self.good_sequences = self.get_sequences(self.data)
            # Context manager ensures the handle is closed even on error.
            with open(self.data_path, 'wb') as f:
                pickle.dump(self.good_sequences, f)
        else:
            # BUG FIX: pickle data is binary -- must open 'rb', not 'r'
            # (text mode fails on Python 3 / non-ASCII bytes).
            with open(self.data_path, 'rb') as f:
                self.good_sequences = pickle.load(f)

    def find_bad_sequences(self, sequences):
        """Return the subset of ``sequences`` whose corpus count falls
        below ``min_count`` (i.e. sequences we have rarely or never seen)."""
        bad_sequences = set()
        for seq in sequences:
            if self.good_sequences.get(seq, 0) < self.min_count:
                bad_sequences.add(seq)
        return bad_sequences

    def correct_string(self, string_corpus):
        """Annotate suspect regions of ``string_corpus`` with <grammar> tags.

        :param string_corpus: raw text to check.
        :returns: tuple of (error_ratio, annotated string, tag triples),
            where each triple is [word, pos, flagged].
        """
        tags, sequences = self.get_sequences(string_corpus)
        bad_sequences = self.find_bad_sequences(sequences)
        # +1 in the denominator guards against division by zero on
        # empty/short input.
        error_ratio = len(bad_sequences) / (float(len(sequences)) + 1)
        # Third element marks whether the word lies in a bad sequence.
        new_tags = [[word, pos, False] for word, pos in tags]

        # Flag every word covered by a bad POS n-gram.
        for l in self.lengths:
            for i in range(l, len(new_tags)):
                seq = "_".join(t[1] for t in new_tags[(i - l):i])
                if seq in bad_sequences:
                    for j in range((i - l), i):
                        new_tags[j][2] = True
        # Wrap each maximal flagged run in <grammar>...</grammar>.
        # NOTE: first and last words are intentionally never opened/closed
        # here, mirroring the original range(1, len-1) behavior.
        for i in range(1, len(new_tags) - 1):
            tag = new_tags[i]
            if tag[2] and not new_tags[i - 1][2]:
                new_tags[i][0] = "<grammar>" + new_tags[i][0]
            if tag[2] and not new_tags[i + 1][2]:
                new_tags[i][0] += "</grammar>"

        words = [nt[0] for nt in new_tags]
        return error_ratio, " ".join(words), new_tags

    def tag(self, string_corpus):
        """POS-tag ``string_corpus`` with the perceptron model."""
        return self.model.tag(string_corpus)

    def get_pos(self, tags):
        """Extract just the POS labels from (word, pos) pairs."""
        return [t[1] for t in tags]

    def get_sequences(self, string_corpus):
        """Tag ``string_corpus`` and count its POS n-grams.

        :returns: tuple of (tags, {sequence_key: count}).
        """
        tags = self.tag(string_corpus)
        pos = self.get_pos(tags)
        sequences = self._get_sequences(pos, self.lengths)
        return tags, sequences

    def _get_sequences(self, pos, lengths):
        """Merge n-gram counts for every window size in ``lengths``."""
        sequences = {}
        for l in lengths:
            sequences.update(self._get_sequence(pos, l))
        return sequences

    def _get_sequence(self, pos, length):
        """Count each ``length``-gram of POS tags, keyed as "A_B_C_D".

        Returns an empty dict when the input is shorter than the window.
        """
        sequences = {}
        if length >= len(pos):
            return sequences

        for i in range(0, len(pos) - length):
            # Progress logging for very large corpora.
            if i % 100000 == 0 and i != 0:
                log.debug("{0}% done with sequence.".format((i / float(len(pos))) * 100))
            val = "_".join(pos[i:i + length])
            sequences[val] = sequences.get(val, 0) + 1
        return sequences
예제 #4
0
파일: grammar.py 프로젝트: fototo/scan
class GrammarCorrector(BaseCorrector):
    """
    Flags likely-ungrammatical text by checking POS-tag n-grams against
    counts gathered from a known-good corpus.

    Adapted from http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    """

    # n-gram window sizes (in POS tags) to examine.
    lengths = [4]
    # A sequence seen fewer than this many times in the reference corpus
    # is treated as "bad" (likely ungrammatical).
    min_count = 3

    def __init__(self, load=True):
        """Build or load the table of known-good POS tag sequences.

        :param load: when True, load pre-computed sequence counts from the
            pickle at ``self.data_path``; when False, rebuild them from
            ``self.data`` (presumably supplied by the base class -- TODO
            confirm) and persist them to disk.
        """
        super(GrammarCorrector, self).__init__()
        self.data_path = os.path.abspath(os.path.join(settings.DATA_PATH, PICKLE_FILENAME))
        self.model = PerceptronTagger()
        if not load:
            tags, self.good_sequences = self.get_sequences(self.data)
            # Context manager ensures the handle is closed even on error.
            with open(self.data_path, "wb") as f:
                pickle.dump(self.good_sequences, f)
        else:
            # BUG FIX: pickle data is binary -- must open "rb", not "r"
            # (text mode fails on Python 3 / non-ASCII bytes).
            with open(self.data_path, "rb") as f:
                self.good_sequences = pickle.load(f)

    def find_bad_sequences(self, sequences):
        """Return the subset of ``sequences`` whose corpus count falls
        below ``min_count`` (i.e. sequences we have rarely or never seen)."""
        bad_sequences = set()
        for seq in sequences:
            if self.good_sequences.get(seq, 0) < self.min_count:
                bad_sequences.add(seq)
        return bad_sequences

    def correct_string(self, string_corpus):
        """Annotate suspect regions of ``string_corpus`` with <grammar> tags.

        :param string_corpus: raw text to check.
        :returns: tuple of (error_ratio, annotated string, tag triples),
            where each triple is [word, pos, flagged].
        """
        tags, sequences = self.get_sequences(string_corpus)
        bad_sequences = self.find_bad_sequences(sequences)
        # +1 in the denominator guards against division by zero on
        # empty/short input.
        error_ratio = len(bad_sequences) / (float(len(sequences)) + 1)
        # Third element marks whether the word lies in a bad sequence.
        new_tags = [[word, pos, False] for word, pos in tags]

        # Flag every word covered by a bad POS n-gram.
        for l in self.lengths:
            for i in range(l, len(new_tags)):
                seq = "_".join(t[1] for t in new_tags[(i - l) : i])
                if seq in bad_sequences:
                    for j in range((i - l), i):
                        new_tags[j][2] = True
        # Wrap each maximal flagged run in <grammar>...</grammar>.
        # NOTE: first and last words are intentionally never opened/closed
        # here, mirroring the original range(1, len-1) behavior.
        for i in range(1, len(new_tags) - 1):
            tag = new_tags[i]
            if tag[2] and not new_tags[i - 1][2]:
                new_tags[i][0] = "<grammar>" + new_tags[i][0]
            if tag[2] and not new_tags[i + 1][2]:
                new_tags[i][0] += "</grammar>"

        words = [nt[0] for nt in new_tags]
        return error_ratio, " ".join(words), new_tags

    def tag(self, string_corpus):
        """POS-tag ``string_corpus`` with the perceptron model."""
        return self.model.tag(string_corpus)

    def get_pos(self, tags):
        """Extract just the POS labels from (word, pos) pairs."""
        return [t[1] for t in tags]

    def get_sequences(self, string_corpus):
        """Tag ``string_corpus`` and count its POS n-grams.

        :returns: tuple of (tags, {sequence_key: count}).
        """
        tags = self.tag(string_corpus)
        pos = self.get_pos(tags)
        sequences = self._get_sequences(pos, self.lengths)
        return tags, sequences

    def _get_sequences(self, pos, lengths):
        """Merge n-gram counts for every window size in ``lengths``."""
        sequences = {}
        for l in lengths:
            sequences.update(self._get_sequence(pos, l))
        return sequences

    def _get_sequence(self, pos, length):
        """Count each ``length``-gram of POS tags, keyed as "A_B_C_D".

        Returns an empty dict when the input is shorter than the window.
        """
        sequences = {}
        if length >= len(pos):
            return sequences

        for i in range(0, len(pos) - length):
            # Progress logging for very large corpora.
            if i % 100000 == 0 and i != 0:
                log.debug("{0}% done with sequence.".format((i / float(len(pos))) * 100))
            val = "_".join(pos[i : i + length])
            sequences[val] = sequences.get(val, 0) + 1
        return sequences