Example #1
def read_system_training_data(filename):
    insts = []
    for inst in pd.read_csv(filename, index_col=None,
                            encoding='UTF-8').to_dict('records'):
        insts.append({
            'dataset': 'E2E',
            'mr': DA.parse_diligent_da(inst['mr']).to_cambridge_da_string(),
            'delex_mr': DA.parse_diligent_da(inst['mr']).get_delexicalized(
                set(['name', 'near'])).to_cambridge_da_string(),
            'system': 'HUMAN',
            'system_ref': None,
            'orig_ref': inst['ref'],
            'informativeness': None,
            'naturalness': None,
            'quality': None,
            'is_real': 0
        })
    log_info(
        "Using %d different training human references to create fake pairs" %
        len(insts))
    return insts
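All of the examples on this page revolve around DA.parse_diligent_da from the tgen / E2E NLG tooling. As a rough orientation, the shared pattern can be sketched as below; this is a minimal sketch based only on how the method is called in these examples, and the import path tgen.data as well as the sample MR string are assumptions, not verified library behaviour.

# Minimal sketch of the common pattern (import path and MR string are assumed).
from tgen.data import DA

mr_text = "name[The Mill], eatType[pub], near[Cafe Rouge]"   # hypothetical E2E-style MR
da = DA.parse_diligent_da(mr_text)                   # parse the 'diligent'/E2E MR format
print(da.to_cambridge_da_string())                   # re-serialize in Cambridge DA notation
delex = da.get_delexicalized(set(['name', 'near']))  # replace selected slot values with placeholders
print(delex.to_cambridge_da_string())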
Example #2
def process_file(tagger_model, input_file):
    detok = Detokenizer()
    df = pd.read_csv(input_file, sep="\t", encoding="UTF-8")
    raw_mrs = list(df['MR'])
    raw_refs = [detok.detokenize(text) for text in list(df['output'])]
    mrs = [DA.parse_diligent_da(mr) for mr in raw_mrs]
    tagger = MorphoTagger(tagger_model)
    tagged_refs = [tagger.tag(line) for line in raw_refs]

    for ff in ['ngram', 'lca', 'collins']:
        write_output(tagged_refs, ff,
                     re.sub(r'\.tsv', '.tag.%s.txt' % ff, input_file))

    stats = data_stats(mrs, tagged_refs, {
        'name': [],
        'near': []
    }, re.sub(r'\.tsv', '', input_file))
    return stats
Example #3
def convert(args):
    src = pd.read_csv(args.src_file, index_col=None, encoding='utf-8')
    df = pd.DataFrame(index=np.arange(len(src)), columns=COLUMNS)
    for src_col, trg_col in COLUMN_MAP.iteritems():
        if isinstance(trg_col, list):
            for trg_col_ in trg_col:
                df[trg_col_] = src[src_col]
        else:
            df[trg_col] = src[src_col]
    df['mr'] = [
        DA.parse_diligent_da(da).to_cambridge_da_string() for da in src['mr']
    ]
    df['is_real'] = np.ones(len(src), dtype=np.int32)
    df['dataset'] = ['INLG'] * len(src)
    df['system'] = ['human'] * len(src)
    df.to_csv(args.out_file,
              columns=COLUMNS,
              sep=b"\t",
              index=False,
              encoding='UTF-8')
Example #4
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    insts = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)

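        # delex_sent is assumed (from its use here) to return the delexicalized
        # text, the possibly modified DA, and a list of abstraction records
        # describing the replaced slot values.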
        text, da, abst = delex_sent(da,
                                    tokenize(conc),
                                    slots_to_abstract,
                                    args.slot_names,
                                    repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
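        # NB: the standard library csv.reader has no 'encoding' argument; this
        # presumably relies on a Unicode-aware drop-in replacement (e.g. the
        # unicodecsv package) being imported as csv.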
        csvread = csv.reader(fh, encoding='UTF-8')
        csvread.next()  # skip header
        for mr, text, voice in csvread:
            da = DA.parse_diligent_da(mr, voice)
            process_instance(da, text)
            insts += 1

        print 'Processed', insts, 'instances.'
        print '%d different DAs.' % len(da_keys)
        print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) /
                                            float(len(das)))
        print 'Max DA len: %d, max text len: %d' % (
            max([len(da) for da in das]),
            max([text.count(' ') + 1 for text in texts]))

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts,
                                                 absts):
            group = groups.get(unicode(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[unicode(conc_da)] = group

        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.itervalues():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join([
                "\t".join([unicode(a) for a in absts_])
                for absts_ in group['abst']
            ]) + "\n")
    else:
        # convert abstraction instruction to string (coordinate output with multi-ref mode)
        absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(unicode(da) + "\n")

    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(unicode(conc_da) + "\n")

    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")

    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")

    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n")
Example #5
File: convert.py  Project: UFAL-DSG/tgen
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    insts = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        csvread = csv.reader(fh, encoding='UTF-8')
        csvread.next()  # skip header
        for mr, text in csvread:
            da = DA.parse_diligent_da(mr)
            process_instance(da, text)
            insts += 1

        print 'Processed', insts, 'instances.'
        print '%d different DAs.' % len(da_keys)
        print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das)))
        print 'Max DA len: %d, max text len: %d' % (max([len(da) for da in das]),
                                                    max([text.count(' ') + 1 for text in texts]))

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts):
            group = groups.get(unicode(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[unicode(conc_da)] = group

        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.itervalues():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join(["\t".join([unicode(a) for a in absts_])
                                    for absts_ in group['abst']]) + "\n")
    else:
        # convert abstraction instruction to string (coordinate output with multi-ref mode)
        absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(unicode(da) + "\n")

    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(unicode(conc_da) + "\n")

    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")

    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")

    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n")
Example #6
def convert(args):
    src = pd.read_csv(args.src_file, index_col=None, encoding='utf-8')
    data = []
    src_col = args.column
    trg_col = COLUMN_MAP[src_col[:3]]
    unique_mrs = set()

    for _, src_inst in src.iterrows():
        mr = DA.parse_diligent_da(src_inst['mr']).to_cambridge_da_string()
        delex_mr = DA.parse_diligent_da(src_inst['mr']).get_delexicalized(
            set(['name', 'near'])).to_cambridge_da_string()
        unique_mrs.add(delex_mr)
        syss = [{
            'sys': src_inst['sys%d' % i],
            'ref': src_inst['ref%d' % i],
            'val': src_inst['%s%d' % (src_col, i)]
        } for i in xrange(1, 6)]

        for sys1, sys2 in itertools.combinations(syss, 2):
            if sys1['val'] < sys2['val']:  # without loss of generality
                sys1, sys2 = sys2, sys1
            if sys1['val'] == sys2['val']:  # ignore those that are equal
                continue
            trg_inst = {
                'dataset': 'E2E',
                'system': SYSTEMS_MAP[sys1['sys']],
                'system2': SYSTEMS_MAP[sys2['sys']],
                'orig_ref': None,
                'mr': mr,
                'delex_mr': delex_mr,
                'system_ref': sys1['ref'],
                'system_ref2': sys2['ref'],
                'is_real': 1,
                'informativeness': None,
                'naturalness': None,
                'quality': None
            }
            trg_inst[trg_col] = 1
            data.append(trg_inst)

    unique_mrs = sorted(list(unique_mrs))
    random.shuffle(unique_mrs)

    part_sizes = [int(p) for p in args.ratio.split(':')]
    part_sizes = [
        int(round(p * len(unique_mrs) / float(sum(part_sizes))))
        for p in part_sizes
    ]
    part_sizes[0] = len(unique_mrs) - sum(part_sizes[1:])
    part_labels = args.labels.split(':')
    part_start = 0
    log_info('Data sizes in MRs: %s' % ':'.join([str(p) for p in part_sizes]))

    # remove ambiguous instances
    if args.unambiguous:
        occs = Counter([(inst['mr'], inst['system'], inst['system2'])
                        for inst in data])
        ambig = set()
        for mr, sys1, sys2 in occs.iterkeys():
            if occs.get((mr, sys2, sys1), 0) == occs[(mr, sys1, sys2)]:
                ambig.add((mr, sys1, sys2))

        uniq_data = []
        used_insts = set()
        for inst in data:
            mr, sys1, sys2 = inst['mr'], inst['system'], inst['system2']
            if (mr, sys1, sys2) in ambig or (mr, sys1, sys2) in used_insts:
                continue
            uniq_data.append(inst)
            used_insts.add((mr, sys1, sys2))
        data = uniq_data

    # mark down the configuration
    with codecs.open(os.path.join(args.out_path, 'config'),
                     'wb',
                     encoding='UTF-8') as fh:
        fh.write(pprint.pformat(vars(args), indent=4, width=100))

    # split the output
    for part_no, (part_size,
                  part_label) in enumerate(zip(part_sizes, part_labels)):
        part_mrs = set(unique_mrs[part_start:part_start + part_size])
        part_data = [inst for inst in data if inst['delex_mr'] in part_mrs]

        if args.shuffle:
            random.shuffle(part_data)

        part_df = pd.DataFrame(part_data)

        if part_no == 0 and args.fake_data:
            # create fake data
            indiv_sys_outputs = get_sys_outputs(part_data)
            if args.fake_data_from:
                indiv_sys_outputs.extend(
                    read_system_training_data(args.fake_data_from))
            fake_insts = create_fake_data(
                pd.DataFrame.from_records(indiv_sys_outputs),
                part_df.columns,
                score_type='rank')
            fake_pairs = create_fake_pairs(fake_insts, len(indiv_sys_outputs))
            part_df = part_df.append(fake_pairs, sort=True)

        out_file = os.path.join(args.out_path, part_label + '.tsv')
        log_info('File: %s, total size %d' % (out_file, len(part_df)))
        part_df.to_csv(out_file,
                       columns=COLUMNS,
                       sep=b"\t",
                       index=False,
                       encoding='UTF-8')

        part_start += part_size
Example #7
def parse_mr(mr_text):
    return DA.parse_diligent_da(mr_text).get_delexicalized(
        set(['name', 'near']))
Example #8
def read_e2e_data():
    with codecs.open('data/e2e-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/e2e-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse_diligent_da(mr) for mr in fh.readlines()]
    return mrs, refs
Example #9
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    insts = 0

    def process_instance(conc_da, conc):
        # sort the DA using the same slot order as in the E2E NLG data
        slot_order = ['name', 'eat_type', 'food', 'price_range', 'rating',
                      'area', 'family_friendly', 'near']
        conc_da.dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))
        conc_das.append(conc_da)

        text, da, abst = delex_sent(conc_da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))

        da_keys[str(da)] = da_keys.get(str(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    data = pd.read_csv(args.in_file, sep=',', encoding='UTF-8')
    data['mr'] = data['mr'].fillna('')
    for inst in data.itertuples():
        da = DA.parse_diligent_da(inst.mr)
        process_instance(da, inst.ref)
        insts += 1
        if insts % 100 == 0:
            print('%d...' % insts, end='', flush=True, file=sys.stderr)

    print('Processed', insts, 'instances.', file=sys.stderr)
    print('%d different DAs.' % len(da_keys), file=sys.stderr)
    print('%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das))),
          file=sys.stderr)
    print('Max DA len: %d, max text len: %d' % (max([len(da) for da in das]),
                                                max([text.count(' ') + 1 for text in texts])),
          file=sys.stderr)

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()  # keep the original order (by 1st occurrence of DA)
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts):
            group = groups.get(str(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[str(conc_da)] = group

        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.values():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join(["\t".join([str(a) for a in absts_])
                                    for absts_ in group['abst']]) + "\n")
    else:
        # convert abstraction instruction to string (coordinate output with multi-ref mode)
        absts = ["\t".join([str(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(str(da) + "\n")

    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(str(conc_da) + "\n")

    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")

    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")

    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n")
Example #10
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions
    original_sents = []
    delexicalised_sents = []
    sent_ids = []
    mrs_for_delex = []

    # statistics about different DAs
    da_keys = {}
    insts = 0
    find_apostrophes = r"([a-z])\s('[a-z]{1,2}\b)"

    def process_instance(da, conc, mr, multi_ref_id):
        original_da = deepcopy(da)
        # It is not clear why the DAs need to be sorted here; we checked that
        # delex_sent sorts them again anyway, so there is nothing to do about
        # it until later.
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        # Originally we didn't want to lowercase (keeping case would make
        # things easier for udpipe later on), but we changed our mind because
        # the upper-case characters interfere with udpipe's sentence
        # tokenization.
        # We need underscores instead of dashes or else udpipe breaks the
        # placeholders apart:
        # text = re.sub(r"X-", r"X_", text)
        # We again ran into problems with leaving X as a capital letter, and
        # with udpipe sometimes segmenting it and sometimes not. We really
        # need a more reliable sentence / word tokenizer.
        text = text.lower().replace('x-', 'x')
        # We're testing out making xnear upper case to see if it reduces the
        # incorrect dropping of it by the deep parser
        text = text.replace('xnear', 'Xnear')

        # detokenize some of the apostrophe stuff because udpipe does it
        # differently. Namely removing spaces between letters and apostrophes
        text = re.sub(find_apostrophes, r"\1\2", text)
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

        # now do our own sentence tokenization and produce the extra outputs
        # required for the human evaluation
        this_conc_sents = sent_tokenize(conc)
        num_sents = len(this_conc_sents)
        this_delex_sents = []
        for i, this_conc_sent in enumerate(this_conc_sents):
            text, _, _ = delex_sent(original_da, tokenize(this_conc_sent), slots_to_abstract, args.slot_names, repeated=True)
            text = text.lower().replace('x-', 'x')
            # We're testing out making xnear upper case to see if it reduces the
            # incorrect dropping of it by the deep parser
            text = text.replace('xnear', 'Xnear')
            # detokenize some of the apostrophe stuff because udpipe does it
            # differently. Namely removing spaces between letters and apostrophes
            text = re.sub(find_apostrophes, r"\1\2", text)
            this_delex_sents.append(text)

            # start appending the sentence specific ones
            sent_ids.append('_'.join([mr.replace(' ', ''), str(multi_ref_id),
                                      str(i)]))
            mrs_for_delex.append(mr)

        # now we're onto something else
        original_sents.append('\n'.join(this_conc_sents))
        delexicalised_sents.append('\n'.join(this_delex_sents))

        # this_delex_sents = sent_tokenize(text)
        # num_sents = len(this_conc_sents)
        # if num_sents != len(this_delex_sents):
        #     # this is very bad if this happens!
        #     # import ipdb; ipdb.set_trace()
        #     print '\n'
        #     print this_conc_sents
        #     print this_delex_sents
        #     print '\nnext example'

        # original_sents.append('\n'.join(this_conc_sents))
        # delexicalised_sents.append('\n'.join(this_delex_sents))
        # for i in range(num_sents):
        #     sent_ids.append('_'.join([mr.replace(' ', ''), str(multi_ref_id),
        #                               str(i)]))
        #     mrs_for_delex.append(mr)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        csvread = csv.reader(fh, encoding='UTF-8')
        csvread.next()  # skip header
        multi_ref_count = Counter()
        for mr, text in tqdm(csvread):
            multi_ref_count[mr] += 1
            da = DA.parse_diligent_da(mr)
            process_instance(da, text, mr, multi_ref_count[mr])
            insts += 1

        print 'Processed', insts, 'instances.'
        print '%d different DAs.' % len(da_keys)
        print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das)))
        print 'Max DA len: %d, max text len: %d' % (max([len(da) for da in das]),
                                                    max([text.count(' ') + 1 for text in texts]))

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts):
            group = groups.get(unicode(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[unicode(conc_da)] = group

        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.itervalues():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join(["\t".join([unicode(a) for a in absts_])
                                    for absts_ in group['abst']]) + "\n")
    else:
        # convert abstraction instruction to string (coordinate output with multi-ref mode)
        absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(unicode(da) + "\n")

    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(unicode(conc_da) + "\n")

    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")

    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")

    # We join on double new lines so that udpipe will read them out as
    # different paragraphs
    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n\n")

    # here are all our new ones
    with codecs.open(args.out_name + '-orig_sents.txt', 'w', 'UTF-8') as fh:
        for this in original_sents:
            fh.write(this + "\n")

    # again gets a double new lines for processing with udpipe
    with codecs.open(args.out_name + '-delex_sents.txt', 'w', 'UTF-8') as fh:
        for this in delexicalised_sents:
            fh.write(this + "\n\n")

    with codecs.open(args.out_name + '-sent_ids.txt', 'w', 'UTF-8') as fh:
        for this in sent_ids:
            fh.write(this + "\n")

    with codecs.open(args.out_name + '-mrs_for_delex.txt', 'w', 'UTF-8') as fh:
        for this in mrs_for_delex:
            fh.write(this + "\n")