Example #1
def test_align():
    cand = ["U.S", ".", "policy"]
    gold = ["U.S.", "policy"]
    assert align(cand, gold) == [0, None, 1]
    cand = ["your", "stuff"]
    gold = ["you", "r", "stuff"]
    assert align(cand, gold) == [None, 2]
    cand = [u'i', u'like', u'2', u'guys', u'   ', u'well', u'id', u'just',
            u'come', u'straight', u'out']
    gold = [u'i', u'like', u'2', u'guys', u'well', u'i', u'd', u'just', u'come',
            u'straight', u'out']
    assert align(cand, gold) == [0, 1, 2, 3, None, 4, None, 7, 8, 9, 10]
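Note: this test uses an older spaCy API in which align() returns a flat index list with None for unmatched tokens. A minimal sketch of the later spaCy 2.x form, which the remaining examples rely on (the installed spaCy version is an assumption here):

# Sketch assuming spaCy 2.x, where spacy.gold.align returns a 5-tuple
# rather than the flat list asserted above.
from spacy.gold import align

cand = ["U.S", ".", "policy"]
gold = ["U.S.", "policy"]
cost, a2b, b2a, a2b_multi, b2a_multi = align(cand, gold)
# a2b[i] is the gold index matching cand[i], or -1 when there is no one-to-one match;
# a2b_multi then records cand tokens that together form a single gold token.
print(cost, list(a2b), list(b2a), a2b_multi, b2a_multi)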
Example #2
def align_tokens_and_ud(token_score_tuples, ud_output):
    result = []
    score_tuples_a = []
    ud_out_b = []
    for token_score in token_score_tuples:
        score_tuples_a.append(token_score[0].lower())
    for row in ud_output:
        ud_out_b.append(row[1].lower())
    alignment = align(score_tuples_a, ud_out_b)
    cost, a2b, b2a, a2b_multi, b2a_multi = alignment

    debug_print = False
    for tuple_index in range(len(token_score_tuples)):
        if a2b[tuple_index] != -1:
            result.append((token_score_tuples[tuple_index][0],
                           token_score_tuples[tuple_index][1],
                           ud_output[a2b[tuple_index]][3],
                           ud_output[a2b[tuple_index]][4]))
        else:
            debug_print = True

    # if len(a2b_multi) > 0:
    #     debug_print = True
    #     print('a2b_multi', a2b_multi)
    # if len(b2a_multi) > 0:
    #     debug_print = True
    #     print('b2a_multi', b2a_multi)

    # if debug_print:
    #     print('a', score_tuples_a)
    #     print('a2b', a2b)
    #     print('b', ud_out_b)
    #     print('b2a', b2a)

    return result
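A hypothetical call, inferring the input shapes from the indexing above: (token, score) pairs and CoNLL-U-style rows with the form in column 1 and the tags copied from columns 3 and 4. The values below are invented for illustration:

token_score_tuples = [("Hello", 0.9), ("world", 0.7), ("!", 0.1)]
ud_output = [
    ["1", "Hello", "hello", "INTJ", "UH", "_", "0", "root", "_", "_"],
    ["2", "world", "world", "NOUN", "NN", "_", "1", "vocative", "_", "_"],
    ["3", "!", "!", "PUNCT", ".", "_", "1", "punct", "_", "_"],
]
# With a perfect one-to-one alignment every token keeps its score and picks up
# the UPOS/XPOS columns of its UD row, e.g.:
# [('Hello', 0.9, 'INTJ', 'UH'), ('world', 0.7, 'NOUN', 'NN'), ('!', 0.1, 'PUNCT', '.')]
print(align_tokens_and_ud(token_score_tuples, ud_output))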
Example #3
def test_line():
    line = 'The switches between clarity and intoxication gave me a headache, but at least the silver-haired faery’s explanation of the queens’ “gifts” helped me understand why I could want to wrap my legs around a creature who terrified me.'

    spacy_doc = nlp(line.lower())
    spacy_tokens = [str(token) for token in spacy_doc]
    spacy_tokens_pos = [token.pos_ for token in spacy_doc]
    bert_tokens = tokenizer.tokenize(line)
    diff = align(bert_tokens, spacy_tokens)[0]
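    # diff is the first element of align()'s return value, i.e. the alignment cost
    # (0 when the two tokenizations match exactly), assuming the spaCy 2.x align API.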

    print('Spacy : {}'.format(spacy_tokens))
    print('BERT  : {}'.format(bert_tokens))
Example #4
def get_corr_ind(tok1: List[str], tok2: List[str],
                 tok1_idx: List[List[int]]) -> List[List[int]]:
    """
    Align two different tokenizations and output tok2_idx.
    tok1: sentence tokenized via method 1
    tok2: sentence tokenized via method 2
    tok1_idx: index groups into tok1
    output: tok2_idx, the corresponding index groups into tok2,
    derived from tok1_idx
    """
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tok1, tok2)
    # If aligned no pains
    # can directly return tok1_idx
    if cost == 0:
        return tok1_idx

    # Else create tok2_idx
    tok2_idx = []
    for t1_idx in tok1_idx:
        t2_idx = []
        for t in t1_idx:
            # If the tok1_idx corresponds
            # to one single token of tok2
            # just use that
            if a2b[t] != -1:
                t2_idx.append(a2b[t])
            # else fall back to the multi-token alignment
            else:
                # hacky implementation:
                # if both the previous and the next word are aligned,
                # assign the current word to the range of tok2 indices
                # that lies between those two alignments
                if t != len(tok1) - 1:
                    if a2b[t-1] != -1 and a2b[t+1] != -1:
                        t2_idx.append(
                            [x for x in range(a2b[t-1] + 1, a2b[t+1])])
                elif a2b[t-1] != -1:
                    t2_idx.append(
                        [x for x in range(a2b[t-1]+1, len(tok2))])
                else:
                    # Currently seems to work,
                    # set_trace to see when it doesn't work
                    import pdb
                    pdb.set_trace()
                    pass
        tok2_idx.append(t2_idx)

    return tok2_idx
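Hypothetical usage of the fast path above: when the two tokenizations are identical, align() reports cost 0 and the index groups are returned unchanged. The values are invented for illustration:

tok1 = ["New", "York", "is", "big"]
tok2 = ["New", "York", "is", "big"]
tok1_idx = [[0, 1], [3]]  # e.g. spans covering "New York" and "big"
assert get_corr_ind(tok1, tok2, tok1_idx) == [[0, 1], [3]]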
Example #5
def test_file(file_number):
    filename = 'books_wiki_en_corpus_' + 'training_' + str(file_number) + '.txt'
    file = os.path.join(base_dir, filename)

    total_diffs = 0
    with open(file) as f:
        text = f.readlines()

        for line in text:

            # spacy_doc = nlp(line.lower())
            # spacy_tokens = [str(token) for token in spacy_doc]

            bert_tokens = tokenizer_bert.tokenize(line)
            doc = Doc(nlp.vocab, words=bert_tokens)
            spacy_tokens = [t.text for t in doc]

            diff = align(bert_tokens, spacy_tokens)[0]

            if diff != 0:
                print('Difference of ' + str(diff) + ' positions at line : ' + line + '\n')
                total_diffs += 1

    print('Total number of lines with differences : {}'.format(total_diffs))
Example #6
def test_align(tokens_a, tokens_b, expected):
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
    assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected
    # check symmetry
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
    assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
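A possible parametrization for a test of this shape, assuming spaCy 2.x align() semantics: identical tokenizations give cost 0, identity mappings, and empty multi-alignment dicts.

import pytest
from spacy.gold import align


@pytest.mark.parametrize(
    "tokens_a,tokens_b,expected",
    [(["a", "b"], ["a", "b"], (0, [0, 1], [0, 1], {}, {}))],
)
def test_align_identity(tokens_a, tokens_b, expected):
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
    assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected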
Example #7
def main(args):
    import csv

    with open(args.data_path) as f:
        data = json.load(f)

    judgement_diffs = []

    N = args.num_grams
    user_kept_pos_ngram_counter = collections.Counter()
    all_gen_pos_ngram_counter = collections.Counter()
    pos_textlist_dict = collections.defaultdict(list)

    gen_len = 0
    gen_ent_len = 0
    interest_len = 0
    interest_ent_len = 0
    num_user_interest = 0
    num_interested = 0
    num_corner_case = 0

    num_uniq_gen_entities = 0
    num_uniq_interest_entities = 0
    num_uniq_gen_predefined_entities = 0
    num_uniq_gen_predefined_entities_hard_match = 0
    num_uniq_interest_predefined_entities = 0

    ent_csvf = open(os.path.join(
        args.output_dir, f'Entity_extraction_{args.type_interested}.csv'),
                    mode='w')
    ent_writer = csv.writer(ent_csvf, delimiter='\t')
    ent_writer.writerow([
        'Generated entities', 'Predefined characters',
        'Generated entities that are from the predefined character list',
        'Generated entities that are from the predefined character list by hard match'
    ])

    for data_idx, item in enumerate(data):
        # copy the existing content, a dict of 4 fields
        new_dict = item
        story = new_dict['story']
        predefined_characters = obtain_predefined_entities(story)
        generated = item['generated']['description']
        finalized = item['finalized']['description']
        gen_doc = nlp(generated)
        gen_spacy_tokens_unprep = [token.text for token in gen_doc]
        gen_spacy_tokens = process_spacy_tokens_for_align(
            gen_spacy_tokens_unprep)

        gen_prep = rouge._preprocess_summary_as_a_whole(generated)
        gen_prep_zero = gen_prep[0]
        gen_rouge_tokens = gen_prep_zero.split()

        cost, s2r, r2s, s2r_multi, r2s_multi = align(gen_spacy_tokens,
                                                     gen_rouge_tokens)

        new_dict['diffs'] = get_diff_score(generated, finalized)
        judgement_diffs.append(new_dict)

        gen_pos = [token.pos_ for token in gen_doc]
        all_gen_pos_ngram_counter.update(ngrams(gen_pos, N))

        ents = gen_doc.ents
        gen_ent_ids = []
        for ent in ents:
            ent_ids = list(range(ent.start, ent.end))
            gen_ent_ids.extend(ent_ids)
        gen_len += len(gen_pos)
        gen_ent_len += len(gen_ent_ids)

        gen_ent_str_list = []
        gen_ent_str_predefined_list = []
        gen_ent_str_predefined_list_hard_match = []
        for ent in ents:  # number of uniq entities
            ent_lc = ent.text.lower()
            gen_ent_str_list.append(ent_lc)
            if ent_lc in predefined_characters:
                gen_ent_str_predefined_list_hard_match.append(ent_lc)

            if contained_in(ent_lc, predefined_characters):
                gen_ent_str_predefined_list.append(ent_lc)

        print('Generated entities')
        print(' | '.join(gen_ent_str_list))
        print('Predefined characters: ')
        print(' | '.join(predefined_characters))
        print(
            'Generated entities that are kept in the predefined character list: '
        )
        print(' | '.join(gen_ent_str_predefined_list))
        print('\n\n')
        ent_writer.writerow([
            ' | '.join(gen_ent_str_list), ' | '.join(predefined_characters),
            ' | '.join(gen_ent_str_predefined_list),
            ' | '.join(gen_ent_str_predefined_list_hard_match)
        ])

        gen_ent_str_int_set = set()
        gen_ent_str_int_predefined_set = set()

        for diff_results in new_dict['diffs']:
            if diff_results['type'] == args.type_interested:
                num_user_interest += 1

                [start_r, end_r] = diff_results['span_hypothesis']
                content = diff_results['content']
                content_rouge = gen_rouge_tokens[start_r:end_r]

                # corner case: end_r is exclusive, so it may point one past the last span index
                if end_r == len(r2s):
                    start_s, end_s = r2s[start_r], r2s[end_r - 1] + 1  # bug causing
                else:
                    start_s, end_s = r2s[start_r], r2s[end_r]  # bug causing

                content_spacy = gen_spacy_tokens[start_s:end_s]

                if content_spacy == []:
                    # print('handle corner case')
                    # print(content)
                    # print(content_rouge)
                    start_s = end_s - 1
                    content_spacy = gen_spacy_tokens[start_s:end_s]
                    # print(content_spacy)
                    # print('\n\n')
                    # num_corner_case += 1

                # print( ' | '.join(content_spacy))

                cur_ngrams_kept = list(ngrams(
                    content_spacy, N))  # a list of n grams of actual text
                pos_tokens = gen_pos[start_s:end_s]
                cur_ngrams_pos = list(ngrams(pos_tokens,
                                             N))  # a list of n grams of pos
                user_kept_pos_ngram_counter.update(
                    cur_ngrams_pos)  # update the count of POS ngrams

                assert len(content_spacy) == len(pos_tokens)
                assert len(cur_ngrams_kept) == len(cur_ngrams_pos)

                # update the dictionary (str, list): (pos_ngram, list of actual ngrams as examples)
                for pos_ngram, text_ngram in zip(cur_ngrams_pos,
                                                 cur_ngrams_kept):
                    pos_ngram_str = ' '.join(pos_ngram)
                    text_ngram_str = ' '.join(text_ngram)
                    pos_textlist_dict[pos_ngram_str].append(text_ngram_str)

                interest_ids = list(range(start_s, end_s))
                entity_interest_ids = [
                    id for id in interest_ids if id in gen_ent_ids
                ]
                interest_ent_len += len(entity_interest_ids)
                interest_len += end_s - start_s

                # check whether the generated entities appear in diff
                for entity in gen_ent_str_list:
                    if entity in ' '.join(content_spacy):
                        gen_ent_str_int_set.add(entity)

                        # check whether an entity in this type of text is present in the set of predefined characters
                        if contained_in(entity, predefined_characters):
                            gen_ent_str_int_predefined_set.add(entity)
                            print(entity)
                        # else:
                        #   # print(entity)

        num_uniq_gen_entities += len(gen_ent_str_list)
        num_uniq_interest_entities += len(gen_ent_str_int_set)
        num_uniq_gen_predefined_entities += len(gen_ent_str_predefined_list)
        num_uniq_gen_predefined_entities_hard_match += len(
            gen_ent_str_predefined_list_hard_match)
        num_uniq_interest_predefined_entities += len(
            gen_ent_str_int_predefined_set)
        # print(gen_ent_str_list)
        # print(gen_ent_str_int_set)
        # print()

    ent_csvf.close()

    print(f'analyzed {data_idx + 1} user judgements')

    print(f'Out of {num_interested} interested diff')
    print(f'there are {num_corner_case} corner cases')

    # convert a counter to a list of tuple, changing its keys from tuple to str
    user_kept_pos_ngram_counter_strkey = collections.Counter(
        dict([(' '.join(k), v)
              for k, v in user_kept_pos_ngram_counter.items()]))
    all_gen_pos_ngram_counter_strkey = collections.Counter(
        dict([(' '.join(k), v) for k, v in all_gen_pos_ngram_counter.items()]))

    # traverse over the pos ngrams user kept, compute their ratio
    print('traverse through the pos ngrams user kept, compute their ratio')
    kept_gen_pos_ngram_ratio_dict = {}
    for k, v in user_kept_pos_ngram_counter_strkey.most_common(50):
        kept_gen_pos_ngram_ratio_dict[k] = user_kept_pos_ngram_counter_strkey[
            k] / all_gen_pos_ngram_counter_strkey[k]
        # if all_gen_pos_ngram_counter_strkey[k] ==0:
        #   print('error')

    items = kept_gen_pos_ngram_ratio_dict.items()
    sorted_items = sorted(items,
                          key=lambda key_value: key_value[1],
                          reverse=True)
    import csv
    # write the pos ngrams analysis results into google spread sheet
    with open(os.path.join(args.output_dir,
                           f'{N}_grams_pos_{args.type_interested}.csv'),
              mode='w') as csvf:
        fieldnames = [
            'pos pattern', 'ratio (NOK / NOG)',
            'Number of Occurrences in user-Kept text (NOK)',
            'Number of Occurrences in Generated text (NOG)',
            'Examples in user-kept text'
        ]
        writer = csv.writer(csvf, delimiter='\t')
        writer.writerow(fieldnames)

        for idx in range(min([50, len(sorted_items)])):
            pos, ratio = sorted_items[idx]
            ck = user_kept_pos_ngram_counter_strkey[pos]
            cg = all_gen_pos_ngram_counter_strkey[pos]
            writer.writerow([
                pos, ratio, ck, cg,
                '  ||  '.join(random.sample(pos_textlist_dict[pos], 5))
            ])
    print('Saved: ')
    print(
        os.path.join(args.output_dir,
                     f'{N}_grams_pos_{args.type_interested}.csv'))
    print(f'gen len: {gen_len}')
    print(f'gen ent len: {gen_ent_len}')

    print(f'{args.type_interested} len: {interest_len}')
    print(f'{args.type_interested} ent len: {interest_ent_len}')

    print(
        f'in generated text the percentage of entity is {gen_ent_len / gen_len}'
    )
    print(
        f'in {args.type_interested} text the percentage of entity is {interest_ent_len / interest_len}'
    )

    print(f'number of user {args.type_interested} edits: {num_user_interest}')
    print(
        f'average user {args.type_interested} edit len: {interest_len / num_user_interest}'
    )

    print(f'Number of entities generated: {num_uniq_gen_entities}')
    print(
        f'Number of entities generated that are from predefined character list: {num_uniq_gen_predefined_entities}'
    )

    print(
        f'Number of entities generated that are from predefined character (by hard match) list: {num_uniq_gen_predefined_entities_hard_match}'
    )
    print(
        f'Number of entities in {args.type_interested}: {num_uniq_interest_entities}'
    )
    print(
        f'Number of entities in {args.type_interested} that are from predefined character list:{num_uniq_interest_predefined_entities}'
    )

    print('End of main')
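The ROUGE-span to spaCy-span remapping used above can be distilled into a small helper; this is only a sketch (the function name is invented), assuming r2s maps each ROUGE token index to a spaCy token index and spans are half-open [start, end):

def rouge_span_to_spacy_span(start_r, end_r, r2s):
    # end_r is exclusive, so it may point one past the last ROUGE token
    if end_r == len(r2s):
        return r2s[start_r], r2s[end_r - 1] + 1
    return r2s[start_r], r2s[end_r]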
Example #8
    def parse(self, document_name: str,
              sentences: Iterable[Sentence]) -> Iterator[Sentence]:
        """Parse visual information embedded in sentence's html_attrs.

        :param document_name: the document name.
        :param sentences: sentences to be linked with visual information.
        :return: A generator of ``Sentence``.
        """
        def attrib_parse(
            html_attrs: List[str], ) -> Dict[str, Union[List[int], List[str]]]:
            ret: Dict[str, Union[List[int], List[str]]] = {}

            for attr in html_attrs:
                key, values = attr.split(
                    "=", 1)  # split only at the first occurrence
                if key in ["left", "top", "right", "bottom", "ppageno"]:
                    ret[key] = [int(x) for x in values.split()]
                elif key == "tokens":
                    # Run RegEx replacements
                    for (rgx, replace) in self.replacements:
                        values = rgx.sub(replace, values)
                    ret[key] = values.split()
            return ret

        for _, group in itertools.groupby(sentences, key=lambda x: x.xpath):
            sents = list(group)

            # Get bbox from document
            attribs = attrib_parse(sents[0].html_attrs)
            lefts = attribs["left"]
            tops = attribs["top"]
            rights = attribs["right"]
            bottoms = attribs["bottom"]
            ppagenos = attribs["ppageno"]

            # Clear the hocr specific html_attrs
            for sent in sents:
                for attr in sent.html_attrs[:]:
                    key, values = attr.split(
                        "=", 1)  # split only at the first occurrence
                    if key in [
                            "left",
                            "top",
                            "right",
                            "bottom",
                            "ppageno",
                            "tokens",
                            "x_wconf",
                    ]:
                        sent.html_attrs.remove(attr)

            # Get a list of all tokens represented by ocrx_word in hOCR
            hocr_tokens = attribs["tokens"]

            # Get a list of all tokens tokenized by spaCy.
            spacy_tokens = [word for sent in sents for word in sent.words]

            # gold.align assumes that both tokenizations add up to the same string.
            cost, h2s, s2h, h2s_multi, s2h_multi = align(
                hocr_tokens, spacy_tokens)

            ptr = 0  # word pointer
            for sent in sents:
                sent.left = []
                sent.top = []
                sent.right = []
                sent.bottom = []
                sent.page = []
                for i, word in enumerate(sent.words):
                    # One-to-one mapping is NOT available
                    if s2h[ptr + i] == -1:
                        if ptr + i in s2h_multi:  # One spacy token-to-multi hOCR words
                            left = lefts[s2h_multi[ptr + i]]
                            top = tops[s2h_multi[ptr + i]]
                            right = rights[s2h_multi[ptr + i]]
                            bottom = bottoms[s2h_multi[ptr + i]]
                            ppageno = ppagenos[s2h_multi[ptr + i]]
                        else:
                            h2s_multi_idx = [
                                k for k, v in h2s_multi.items() if ptr + i == v
                            ]
                            start, end = 0, 0
                            if h2s_multi_idx:  # One hOCR word-to-multi spacy tokens
                                start = h2s_multi_idx[0]
                                end = h2s_multi_idx[-1] + 1
                            else:
                                start = s2h_multi[i - 1 if i > 0 else 0]
                                end = s2h_multi[i + 1] + 1
                            # calculate a bbox that can include all
                            left = min(lefts[start:end])
                            top = min(tops[start:end])
                            right = max(rights[start:end])
                            bottom = max(bottoms[start:end])
                            ppageno = ppagenos[start]
                    # One-to-one mapping is available
                    else:
                        left = lefts[s2h[ptr + i]]
                        top = tops[s2h[ptr + i]]
                        right = rights[s2h[ptr + i]]
                        bottom = bottoms[s2h[ptr + i]]
                        ppageno = ppagenos[s2h[ptr + i]]
                    sent.left.append(left)
                    sent.top.append(top)
                    sent.right.append(right)
                    sent.bottom.append(bottom)
                    sent.page.append(ppageno + 1)  # 1-based in Fonduer
                ptr += len(sent.words)
                yield sent
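The fallback chain above can be summarized in a short sketch (the helper name is invented), assuming spaCy 2.x align() output: s2h holds one-to-one matches (-1 when absent), s2h_multi maps several spaCy tokens onto one hOCR word, and h2s_multi the reverse.

def hocr_indices_for(spacy_idx, s2h, s2h_multi, h2s_multi):
    if s2h[spacy_idx] != -1:
        return [s2h[spacy_idx]]        # one-to-one match
    if spacy_idx in s2h_multi:
        return [s2h_multi[spacy_idx]]  # several spaCy tokens -> one hOCR word
    # one hOCR word -> several spaCy tokens: collect every hOCR index mapped to this token
    # (the parser above additionally falls back to a neighbour-based range when this is empty)
    return [h for h, s in h2s_multi.items() if s == spacy_idx]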
Example #9
def link_spans():
    """Match spans with verbs and assign data.

    The primary challenge is that there is no easy way to link the
    Spacy-parsed doc data with the word data we have for GBI. Remember
    that the full verse texts are compiled from the word lists and then
    parsed as full verses. The challenge is to use the indexing from
    before the word lists were compiled to match the indexing of the
    Spacy doc object, which then needs to be cross-referenced with the
    Spacy Matcher object.
    """

    ts.indent(0, reset=True)
    ts.info('matching spans...')
            
    inspect = InspectionDoc()
    bhsa2eng = collections.defaultdict(dict)

    for trans, bhsa_nodes in english_verbs.items():
        
        for bhsa_node, para_words in bhsa_nodes.items():

            # get GBI-side data
            verse_ref = id2ref(para_words[0], 'translation')        
            para_text = ' '.join(word_data[trans][w]['text'] for w in para_words)
            verse_words = verse2words[trans][verse_ref]
            verse_tokens = [word_data[trans][w]['text'] for w in verse_words]
            verse_tokens = [t.replace(';','.') for t in verse_tokens]
            
            # get Spacy-side data
            verse_parsing = parsed_verses[trans][verse_ref]
            spacy_tokens = [str(t) for t in verse_parsing]
            
            # map Spacy tokens back to GBI tokens using indices
            # Spacy tokenizes words with apostrophes differently (e.g. `he'll` == `he` + `'ll`)
            # They can be re-aligned: https://spacy.io/usage/linguistic-features#aligning-tokenization
            cost, a2b, b2a, a2b_multi, b2a_multi = align(spacy_tokens, verse_tokens)  # alignment of indices here
            aligner = lambda i: a2b_multi.get(i, a2b[i])  # returns the 1-to-1 or many-to-1 aligned index
            
            # try to retrieve span links with advanced tense tags
            verse_sents = list(verse_parsing.sents)
            spans = verse2spans[trans].get(verse_ref, [])
            span_match = trans_to_span(para_words, spans, verse_words, aligner) or '' # search for overlapping GBI id sets
            if span_match:
                tense_tag = span_match._.tense_tag
                sentence_i = verse_sents.index(span_match[-1].sent) 
            else:
                tense_tag = ''
                sentence_i = None
            
            # retrieve basic parsings
            raw_tokens = []
            for i, token in enumerate(verse_parsing):
                if verse_words[aligner(i)] in para_words:
                    raw_tokens.append(token)
                    
            vb_tokens = [t for t in raw_tokens if t.tag_.startswith('VB')]

           
                
            # save the data
            data = {
                'eng_ref': verse_ref,
                'words': para_text,
                'tags': '|'.join(t.tag_ for t in raw_tokens),
                'vb_tags': '|'.join(t.tag_ for t in vb_tokens),
                'tense': tense_tag,
                'tense_span': f'{span_match}',
                'sentence_i': sentence_i,
            }
            
            bhsa2eng[trans][bhsa_node] = data
                
            # add strings to inspection file
            if span_match and span_match._.tense_tag:
                inspect.data[trans][verse_ref] += f'\t\tMATCH: {bhsa_node}|{tense_tag}|{span_match}|{para_text}\n'
            else:
                inspect.data[trans][verse_ref] += f'\t\tMISS: {bhsa_node}|||{para_text}\n'

                
    ts.info('done with matches')
    return (bhsa2eng, inspect)
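A toy illustration of the aligner fallback above, assuming spaCy 2.x align(): a2b gives the one-to-one mapping (-1 when a Spacy token has no single match) and a2b_multi covers splits such as `he'll` -> `he` + `'ll`.

from spacy.gold import align

spacy_toks = ["he", "'ll", "go"]
gbi_toks = ["he'll", "go"]
cost, a2b, b2a, a2b_multi, b2a_multi = align(spacy_toks, gbi_toks)
aligner = lambda i: a2b_multi.get(i, a2b[i])
print([aligner(i) for i in range(len(spacy_toks))])  # expected: [0, 0, 1]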
Example #10
    def __init__(self, gold, pred, verbose=False, group=False):
        """
        Align gold and predicted tokens and their tags, and create dictionaries of falsely predicted tags.
        :param gold: the gold conllu file
        :param pred: the predicted conllu file
        :param verbose: if true, print information about token numbers
        :param group: if true, group each falsely predicted ufeats label into a dictionary that maps it to all
        the labels it was falsely assigned and the number of times each such prediction occurred
        """

        gold = C.load_conll(open(gold,
                                 'r', encoding='utf8'))
        gold_dic = C.convert_conll(gold)  # returns a dictionary with all the column names
        gold_doc = Document(gold_dic)

        pred = C.load_conll(open(pred, 'r', encoding='utf8'))
        pred_dic = C.convert_conll(pred)  # returns a dictionary with all the column names
        pred_doc = Document(pred_dic)

        # get the tokens
        self.gold_tokens = [j['text'] for i in gold_dic for j in i]
        self.pred_tokens = [j['text'] for i in pred_dic for j in i]

        # get upos tags
        gold_tags = [j['upos'] for i in gold_dic for j in i]
        pred_tags = [j['upos'] for i in pred_dic for j in i]

        # get xpos tags
        gold_xpos = [j['xpos'] for i in gold_dic for j in i]
        pred_xpos = [j['xpos'] for i in pred_dic for j in i]

        # get ufeats tag
        gold_feats = list()
        pred_feats = list()
        for i in gold_dic:
            for j in i:
                if 'feats' in j:
                    gold_feats.append(j['feats'])
                else:
                    gold_feats.append('_')
        for i in pred_dic:
            for j in i:
                if 'feats' in j:
                    pred_feats.append(j['feats'])
                else:
                    pred_feats.append('_')

        if verbose:
            print('Number of gold tokens:', len(self.gold_tokens), ', number of predicted tokens:', len(self.pred_tokens))

        # align gold and predicted tokens
        cost, a2b, b2a, a2b_multi, b2a_multi = align(self.gold_tokens, self.pred_tokens)
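        # Note (assumption: spaCy 2.x align semantics): a2b[i] / b2a[i] hold the index of the
        # one-to-one counterpart token, or -1 when there is none; a2b_multi / b2a_multi then
        # record tokens that merge into a single counterpart token.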

        # align tokens and their POS tags separately
        self.aligned = list()  # tokens
        self.aligned_pos = list()  # upos
        self.aligned_feats = list()
        self.aligned_xpos = list()
        for i in range(len(b2a)):
            t = (self.gold_tokens[b2a[i]], self.pred_tokens[i])
            self.aligned.append(t)
            p = (gold_tags[b2a[i]], pred_tags[i])
            self.aligned_pos.append(p)
            f = (gold_feats[b2a[i]], pred_feats[i])
            self.aligned_feats.append(f)
            x = (gold_xpos[b2a[i]], pred_xpos[i])
            self.aligned_xpos.append(x)

        # align predicted tags to golden tags, not vice versa as before
        gold_aligned = list()
        for i in range(len(a2b)):
            t = (self.gold_tokens[i], self.pred_tokens[a2b[i]])
            gold_aligned.append(t)

        overall = list()
        for (a, b) in self.aligned:
            if a == b:
                overall.append((a, b))
        if verbose:
            print('Aligned tokens. GOLD:', len(gold_aligned), 'PREDICTED:', len(self.aligned), 'ALIGNED:', len(overall))

        self.conf_tags = {} # falsely predicted upos tags
        self.conf_tags_all = {}  # all upos tags
        self.incorrect_upos = 0  # number of incorrectly predicted upos tags
        # how many times different tags co-occurred in gold and pred files
        i = 0
        for (a, b) in self.aligned_pos:
            if a != b:
                self.incorrect_upos += 1
                if (a, b) not in self.conf_tags:
                    self.conf_tags[(a, b)] = 1
                else:
                    self.conf_tags[(a, b)] += 1
            if (a, b) not in self.conf_tags_all:
                self.conf_tags_all[(a, b)] = 1
            else:
                self.conf_tags_all[(a, b)] += 1
            i += 1

        self.conf_feats = {}
        self.conf_feats_all = {}
        self.incorrect_feats = 0
        i = 0
        for (a, b) in self.aligned_feats:
            a = "|".join(sorted(feat for feat in a.split("|")
                                if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
            b = "|".join(sorted(feat for feat in b.split("|")
                                if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
            if a != b:
                self.incorrect_feats += 1
                # create a dictionary for each falsely predicted ufeats labels and group all its false predictions
                if group:
                    if a not in self.conf_feats:
                        self.conf_feats[a] = dict()
                        self.conf_feats[a][b] = 1
                    else:
                        if b not in self.conf_feats[a]:
                            self.conf_feats[a][b] = 1
                        else:
                            self.conf_feats[a][b] += 1
                else:
                    if (a, b) not in self.conf_feats:
                        self.conf_feats[(a, b)] = 1
                    else:
                        self.conf_feats[(a, b)] += 1
            if (a, b) not in self.conf_feats_all:
                self.conf_feats_all[(a, b)] = 1
            else:
                self.conf_feats_all[(a, b)] += 1
            i += 1

        self.conf_xpos = {}
        self.incorrect_xpos = 0
        i = 0
        for (a, b) in self.aligned_xpos:
            if a != b:
                self.incorrect_xpos += 1
                if (a, b) not in self.conf_xpos:
                    self.conf_xpos[(a, b)] = 1
                else:
                    self.conf_xpos[(a, b)] += 1
            i += 1