Example #1
    def _create(self):
        file = open(self.input_path, 'rb')
        su = SerialUnpickler(file)

        unique_dict = create_dict(su)

        # unique_dict = collections.defaultdict(dict)
        # index = collections.defaultdict(int)
        # for paragraph in tqdm(su, total=85663, desc='Processing %s' % str(self.__class__.__name__)):
        #     for sentence, sentence_orig in paragraph:
        #         for sample in sentence:
        #             for name, values in sample.features.items():
        #                 if isinstance(values, str) or isinstance(values, numbers.Number):
        #                     values = [values]
        #
        #                 for value in values:
        #                     if value not in unique_dict[name]:
        #                         unique_dict[name][value] = index[name]
        #                         index[name] += 1
        #
        #         for sample in sentence_orig:
        #             for name, values in sample.features.items():
        #                 if isinstance(values, str) or isinstance(values, numbers.Number):
        #                     values = [values]
        #
        #                 for value in values:
        #                     if value not in unique_dict[name]:
        #                         unique_dict[name][value] = index[name]
        #                         index[name] += 1

        file.close()

        file = open(self.output_path(), 'wb')
        pickle.dump(unique_dict, file)
        file.close()
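The commented-out block above hints at what create_dict does. A minimal sketch, assuming the helper simply factors out that logic (it is not shown in this example): every feature value seen in either the analyzed or the gold sentence gets a unique per-feature integer index.

import collections
import itertools
import numbers

def create_dict(su):
    # Assumed reconstruction of the helper, mirroring the commented-out code:
    # map each feature value to a unique per-feature integer index.
    unique_dict = collections.defaultdict(dict)
    index = collections.defaultdict(int)
    for paragraph in su:
        for sentence, sentence_orig in paragraph:
            for sample in itertools.chain(sentence, sentence_orig):
                for name, values in sample.features.items():
                    if isinstance(values, (str, numbers.Number)):
                        values = [values]
                    for value in values:
                        if value not in unique_dict[name]:
                            unique_dict[name][value] = index[name]
                            index[name] += 1
    return unique_dict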
Example #2
    def learn(self, path, stop=-1, start=0, ids=None):
        lemma_count = collections.defaultdict(
            lambda: collections.defaultdict(int))
        if ids is None:
            ids = []
        su = SerialUnpickler(open(path, 'rb'), stop=stop, start=start, ids=ids)
        for paragraph in su:
            for sentence, sentence_orig in paragraph:
                for sample in sentence_orig:
                    # print(sample.features)
                    if 'lemma' in sample.features:  # some samples don't have a lemma because they are not on the gold segmentation
                        lemma_count[(sample.features['token'],
                                     sample.features['label']
                                     )][sample.features['lemma']] += 1

        # print(lemma_count[('Morawieckiego','subst:sg:gen:m1')])
        # defaultdict(<class 'int'>, {'morawieckiego': 7, 'Morawiecki': 7, 'Morawieckiego': 1})

        for k, v in lemma_count.items():
            # try:
            #     xxx = sorted(v.items(), key=lambda x: (x[1], x[0]), reverse=True)
            #     if xxx[0][1]==xxx[1][1]:
            #         print(k, xxx)
            # except: pass

            # if len(v)>1: print(k, sorted(v.items(), key=lambda x: (x[1], x[0]), reverse=True))
            # TODO: lemmatize depending on the word's position in the sentence - a capitalized non-initial word should keep its capital letter

            best = sorted(v.items(), key=lambda x: (x[1], x[0]),
                          reverse=True)[0]  # TODO: handle several lemmas with the same count
            self.lemmas[k] = best[0]
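The choice of best above breaks ties on the count by comparing the lemma strings themselves; a small standalone illustration, using the frequency counts from the commented-out debug print:

# Counts copied from the comment above.
v = {'morawieckiego': 7, 'Morawiecki': 7, 'Morawieckiego': 1}
best = sorted(v.items(), key=lambda x: (x[1], x[0]), reverse=True)[0]
print(best)  # ('morawieckiego', 7): on equal counts the lexicographically
             # greater (here the lowercase) lemma wins, which is the case the
             # TODO about identical statistics refers to.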
Example #3
    def _create(self):
        file = open(self.input_path, 'rb')
        su = SerialUnpickler(file)

        file2 = open(self.output_path(), 'wb')
        sp = SerialPickler(file2)

        paragraph: Paragraph
        for paragraph in tqdm(su,
                              total=18484,
                              desc='Processing %s' %
                              str(self.__class__.__name__)):
            paragraph_sequence = []
            for sentence, sentence_gold in paragraph:
                sequence = list(sentence)
                for sample in sentence:
                    sample.features['tags4e3'] = create_token_features(
                        sample.features['token'], sample.features['tags'],
                        sample.features['space_before'])

                paragraph_sequence.append((sequence, sentence_gold))
            sp.add(paragraph_sequence)

        file.close()
        file2.close()
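Whatever _create writes with SerialPickler can be read back with SerialUnpickler, just as the input was consumed above; a minimal sketch (the path is illustrative):

from krnnt.serial_pickle import SerialUnpickler

with open('preprocessed.spickle', 'rb') as f:
    for paragraph_sequence in SerialUnpickler(f):
        for sequence, sentence_gold in paragraph_sequence:
            for sample in sequence:
                # each sample now carries the extra 'tags4e3' feature
                print(sample.features['tags4e3'])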
Example #4
def count_sentences(path, ids=None):
    if ids is None:
        ids = []
    count = 0
    su = SerialUnpickler(open(path, 'rb'), ids=ids)
    for paragraph in su:
        for sentence in paragraph:
            count += 1
    return count
Example #5
    def _create(self):
        file = open(self.input_path, 'rb')
        su = SerialUnpickler(file)

        file2 = open(self.output_path(), 'wb')
        sp = SerialPickler(file2)

        paragraph: Paragraph
        for paragraph in tqdm(su,
                              total=18484,
                              desc='Processing %s' %
                              str(self.__class__.__name__)):
            paragraph_sequence = preprocess_paragraph_preanalyzed(paragraph)

            sp.add(paragraph_sequence)

        file.close()
        file2.close()
Example #6
    def _create(self):
        file = open(self.input_path, 'rb')
        su = SerialUnpickler(file)

        file2 = open(self.output_path(), 'wb')
        sp = SerialPickler(file2)

        import jsonlines
        jf = jsonlines.open(self.output_path() + '.jsonl', mode='w')

        paragraph: Paragraph
        for paragraph in tqdm(su,
                              total=18484,
                              desc='Processing %s' %
                              str(self.__class__.__name__)):
            paragraph_sequence = preprocess_paragraph_reanalyzed(paragraph)

            jf.write(serialize_sample_paragraph(paragraph_sequence))
            sp.add(paragraph_sequence)

        file.close()
        file2.close()
Example #7
import math
from argparse import ArgumentParser

from krnnt.serial_pickle import SerialPickler, SerialUnpickler, count_samples

if __name__ == '__main__':
    parser = ArgumentParser(description='Split data')
    parser.add_argument('input_path', help='input path to data')
    parser.add_argument('output_path1', help='output path for the first part')
    parser.add_argument('output_path2', help='output path for the second part')
    parser.add_argument('ratio',
                        type=float,
                        help='ratio of data to write to the first output')

    args = parser.parse_args()

    num_data = count_samples(args.input_path)
    first_part = math.ceil(num_data * args.ratio)

    sp1 = SerialPickler(open(args.output_path1, 'wb'))
    sp2 = SerialPickler(open(args.output_path2, 'wb'))

    su = SerialUnpickler(open(args.input_path, 'rb'))
    for i, paragraph in enumerate(su):
        if i < first_part:
            sp1.add(paragraph)
        else:
            sp2.add(paragraph)
    sp1.close()
    sp2.close()
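An illustrative invocation (file names are hypothetical): e.g. prog train-analyzed.spickle part1.spickle part2.spickle 0.9. Assuming count_samples counts the top-level pickled records, the first ceil(0.9 * N) paragraphs go to the first output and the remainder to the second, so the split granularity is the paragraph, not the sentence.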
Example #8
                sentence2.append({
                    'token': token.form,
                    'sep': token.space_before,
                    'tag': token.gold_form.tags,
                    'lemmas': [token.gold_form.lemma],
                })
        except AttributeError:  # omit the sentence if some token does not have a gold tag
            continue
    return paragraph2


if __name__ == '__main__':
    parser = ArgumentParser(
        description='Export data (before preprocessing) to the given output format')
    parser.add_argument('input_path', help='input path to data')
    parser.add_argument('output_path', help='output path to data')
    parser.add_argument('-f', '--format', default='txt', help='output format')

    args = parser.parse_args()

    with open(args.input_path, 'rb') as file:
        su = SerialUnpickler(file)

        converter = get_output_converter(args.format)

        string = converter(
            (paragraph_to_result(paragraph_gold) for paragraph_gold in su))

        with open(args.output_path, 'w') as output_file:
            output_file.write(string)
Example #9
def generate_arrays_from_file(path,
                              unique_features_dict,
                              feature_name,
                              label_name,
                              stop=-1,
                              start=0,
                              ids=None,
                              keep_unaligned=False,
                              keep_infinity=True):
    if ids is None:
        ids = []
    while 1:
        su = SerialUnpickler(open(path, 'rb'), stop=stop, start=start, ids=ids)
        for paragraph in su:
            for sentence, sentence_orig in paragraph:
                X_sentence = []
                y_sentence = []
                if not sentence: continue  # TODO

                same_segmentation = len(sentence) == len(
                    sentence_orig) and len([
                        sample
                        for sample in sentence if 'label' in sample.features
                    ])
                if (not same_segmentation) and not keep_unaligned:
                    continue

                if keep_unaligned and same_segmentation:
                    for sample in sentence:
                        X_sentence.append(
                            np.array(
                                k_hot(sample.features[feature_name],
                                      unique_features_dict[feature_name])))
                        if label_name == 'label':
                            y_sentence.append(
                                np.array(
                                    k_hot([sample.features[label_name]],
                                          unique_features_dict[label_name])))
                        else:
                            y_sentence.append(
                                np.array(
                                    k_hot(sample.features[label_name],
                                          unique_features_dict[label_name])))
                else:
                    for sample in sentence:
                        X_sentence.append(
                            np.array(
                                k_hot(sample.features[feature_name],
                                      unique_features_dict[feature_name])))
                    for sample in sentence_orig:
                        if label_name == 'label':
                            y_sentence.append(
                                np.array(
                                    k_hot([sample.features[label_name]],
                                          unique_features_dict[label_name])))
                        else:
                            y_sentence.append(
                                np.array(
                                    k_hot(sample.features[label_name],
                                          unique_features_dict[label_name])))

                # print len(X_sentence), len(y_sentence)
                yield (X_sentence, y_sentence, sentence, sentence_orig)
        if not keep_infinity: break
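The generator above relies on a k_hot helper that is not shown; a minimal sketch, assuming it encodes a list of feature values as a multi-hot vector over the value-to-index mapping produced by the dictionary step (the signature is an assumption):

import numpy as np

def k_hot(values, value_to_index):
    # Hypothetical helper: one slot per known value of this feature,
    # set to 1 for every value present in the sample.
    vector = np.zeros(len(value_to_index), dtype=np.float32)
    for value in values:
        if value in value_to_index:
            vector[value_to_index[value]] = 1.0
    return vector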
Example #10
Reanalyze corpus with Maca.

E.g. prog train-gold.spickle train-reanalyzed.spickle
"""

if __name__ == '__main__':
    parser = ArgumentParser(usage=usage)
    parser.add_argument('file_path', type=str, help='paths to corpus')
    parser.add_argument('output_path', type=str, help='save path')
    parser.add_argument('--maca_config', default='morfeusz2-nkjp', help='Maca config')
    parser.add_argument('--toki_config_path', default='', help='Toki config path (directory)')
    args = parser.parse_args()

    file1 = open(args.file_path, 'rb')
    su_gold = SerialUnpickler(file1)

    file2 = open(args.output_path, 'wb')
    sp = SerialPickler(file2)

    maca_analyzer = MacaAnalyzer(args.maca_config)

    paragraph_gold: Paragraph
    for j, paragraph_gold in tqdm(enumerate(su_gold), total=18484, desc='Morphological analysis'):
        paragraph_raw = paragraph_gold.text()

        paragraph_reanalyzed = maca_analyzer.analyze(paragraph_raw)

        print('Number of sentences by Maca vs gold', len(paragraph_reanalyzed.sentences), len(paragraph_gold.sentences))

        paragraph_reanalyzed = align_paragraphs(paragraph_reanalyzed, paragraph_gold)
Example #11
if __name__ == '__main__':
    parser = ArgumentParser(
        description=
        'Combines the analyzed corpus with the gold one. The analyzed corpus must use gold segmentation.'
    )
    parser.add_argument('gold_path', help='')
    parser.add_argument('analyzed_path', help='')
    parser.add_argument('output_path', help='')
    args = parser.parse_args()

    file_path1 = args.gold_path
    file_path2 = args.analyzed_path
    output_path = args.output_path

    file1 = open(file_path1, 'rb')
    su_gold = SerialUnpickler(file1)

    file2 = open(file_path2, 'rb')
    su_analyzed = SerialUnpickler(file2)

    file3 = open(output_path, 'wb')
    sp = SerialPickler(file3)

    for paragraph_gold in su_gold:
        for sentence_gold in paragraph_gold:
            paragraph_analyzed = next(iter(su_analyzed))
            assert len(paragraph_analyzed.sentences) == 1
            sentence_analyzed = paragraph_analyzed.sentences[0]
            assert len(sentence_analyzed.tokens) == len(sentence_gold.tokens)
            for token_gold, token_analyzed in zip(sentence_gold,
                                                  sentence_analyzed):