Example #1
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p, encoding='utf-8'))['sentences']:  # SH changed; was: json.load(open(p))['sentences']
        tokens = [t['word'] for t in sent['tokens']]
        if (lower):
            tokens = [t.lower() for t in tokens]
        if (tokens[0] == '@highlight') or (tokens[0] == '@songtitle'):
            flag = True
            # tgt.append(tokens[1:])  # SH added
            continue
        if (flag):
            tgt.append(tokens)
            # flag = False
        else:
            if '@songtitle' not in tokens:  # SH added to avoid empty tgts
                source.append(tokens)
            else:
                ind = tokens.index('@songtitle')
                source.append(tokens[:ind-1])  # note: this also drops the token just before '@songtitle'
                flag = True
                title = tokens[ind+1:]
                tgt.append(title)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
Example #2
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if (lower):
            tokens = [
                t.lower() for t in tokens
                if t.lower() not in ('\n', '\r', ' ')  # was "is not": identity comparison, not equality
            ]
        if len(tokens) == 0:
            continue
        if (tokens[0] == '@highlight'):
            flag = True
            continue
        if (flag):
            tgt.append(tokens)
            # flag = False
        else:
            source.append(tokens)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
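Both loaders above expect the JSON layout emitted by Stanford CoreNLP: a 'sentences' list whose entries carry 'tokens' dicts with a 'word' field. Below is a minimal, self-contained sketch of driving either loader; the file name and the identity clean() stub are stand-ins, not part of the original code.

import json

def clean(s):  # identity stand-in for the clean() helper the loaders call (assumption)
    return s

# a tiny CoreNLP-style document: one story sentence, an '@highlight' marker,
# then one summary sentence
doc = {"sentences": [
    {"tokens": [{"word": w} for w in ["The", "cat", "sat", "."]]},
    {"tokens": [{"word": "@highlight"}]},
    {"tokens": [{"word": w} for w in ["Cat", "sits", "."]]},
]}
with open("sample.story.json", "w", encoding="utf-8") as f:
    json.dump(doc, f)

source, tgt = load_json("sample.story.json", lower=True)
print(source)  # [['the', 'cat', 'sat', '.']]
print(tgt)     # [['cat', 'sits', '.']]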
Example #3
def vn_format_to_json(args):
    stories_dir = os.path.abspath(args.raw_path)
    tokenized_stories_dir = os.path.abspath(args.save_path)

    print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir))
    stories = glob.glob(pjoin(args.raw_path, '*.txt'))
    annotator = VnCoreNLP("./vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

    dataset = []
    for s in stories:
        tgt = []
        source = []
        flag = False
        with open(pjoin(stories_dir, s), encoding='utf-8') as f:  # close the file when done; glob already prefixes raw_path, so this join assumes raw_path is absolute
            for line in f:
                if line == '\n':
                    continue
                if line == '@highlight\n':
                    flag = True
                    continue
                tokens = annotator.tokenize(line)
                if flag:
                    tgt.extend(tokens)
                else:
                    source.extend(tokens)  # was "source = tokens", which kept only the last source line
        dataset.append({"src": [clean(' '.join(sent)).split() for sent in source],
                        "tgt": [clean(' '.join(sent)).split() for sent in tgt]})

    print("Tokenizing %i files in %s" % (len(stories), stories_dir))
    print("VNCoreNLP Tokenizer has finished.")

    valid_test_ratio = 0.1
    all_size = len(dataset)
    test_sets = dataset[:int(all_size * valid_test_ratio)]
    valid_sets = dataset[int(all_size * valid_test_ratio):int(all_size * valid_test_ratio * 2)]
    train_sets = dataset[int(all_size * valid_test_ratio * 2):]
    corpora = {'train': train_sets, 'valid': valid_sets, 'test': test_sets}
    for corpus_type in ['train', 'valid', 'test']:
        # write the corpus out in shards of up to args.shard_size examples
        shard_count = (len(corpora[corpus_type]) + args.shard_size - 1) // args.shard_size  # ceil division
        for p_ct in range(shard_count):
            split = corpora[corpus_type][p_ct * args.shard_size:(p_ct + 1) * args.shard_size]
            pt_file = pjoin(args.save_path, corpus_type + '.' + str(p_ct) + '.json')
            with codecs.open(pt_file, 'w', encoding='utf-8') as save:
                json.dump(split, save, ensure_ascii=False)
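As shown, the function assumes imports along the lines of glob, os, json, codecs, from os.path import join as pjoin, and from vncorenlp import VnCoreNLP, plus the clean() helper. The shard loop writes each corpus in ceil(len / shard_size) files; a quick check of that arithmetic with illustrative numbers:

corpus = list(range(8))       # 8 examples
shard_size = 3
shard_count = (len(corpus) + shard_size - 1) // shard_size  # ceil(8 / 3) = 3
shards = [corpus[i * shard_size:(i + 1) * shard_size] for i in range(shard_count)]
print(shards)  # [[0, 1, 2], [3, 4, 5], [6, 7]]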
Example #4
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p, encoding='utf-8'))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if (lower):
            tokens = [t.lower() for t in tokens]
        if (tokens[0] == '@highlight'):
            flag = True
            continue
        if (flag):
            tgt.append(tokens)
            flag = False
        else:
            source.append(tokens)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
Example #5
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))["sentences"]:
        tokens = [t["word"] for t in sent["tokens"]]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == "@highlight":
            flag = True
            tgt.append([])
            continue
        if flag:
            tgt[-1].extend(tokens)
        else:
            source.append(tokens)

    source = [clean(" ".join(sent)).split() for sent in source]
    tgt = [clean(" ".join(sent)).split() for sent in tgt]
    return source, tgt
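Unlike Example #4, where flag is reset so each '@highlight' captures exactly one following sentence, this variant opens a new group per marker and keeps extending it, so a multi-sentence highlight stays together as one target. A small illustration with made-up tokens:

stream = [["@highlight"],
          ["first", "summary", "sentence", "."],
          ["second", "summary", "sentence", "."]]

tgt = []
flag = False
for tokens in stream:
    if tokens[0] == "@highlight":
        flag = True
        tgt.append([])   # start a new target group per marker
        continue
    if flag:
        tgt[-1].extend(tokens)  # keep extending the current group
print(tgt)  # [['first', 'summary', 'sentence', '.', 'second', 'summary', 'sentence', '.']]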
Example #6
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == '@highlight':
            flag = True
            tgt.append([])
            continue
        if flag:
            tgt[-1].extend(tokens)
        else:
            source.append(tokens)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
Example #7
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if (lower):
            tokens = [t.lower() for t in tokens]
        if (tokens[0] == '@highlight'):
            flag = True
            continue
        if (flag):
            tgt.append(tokens)
            # commented out because in the CNN dataset there's a '@highlight' for each summary
            # sentence, whereas I only put one.
            # flag = False
        else:
            source.append(tokens)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
Example #8
def load_json(p, lower=False):
    regex = r"([\w][.])+?[ ]([\w])"
    to = r"\1\n\2"

    examples = []

    for data in tqdm.tqdm(json.load(open(p))):
        if data['type'] != "video":  # skip video-only items
            if len(data["content"]) == 0 or len(data['summary']) == 0:
                continue
            src_sentences = re.sub(regex, to,
                                   data["content"].strip()).split("\n")
            tgt_sentences = re.sub(regex, to,
                                   data['summary'].strip()).split("\n")

            src_tokens = [clean(sent).split(' ') for sent in src_sentences]
            tgt_tokens = [clean(sent).split(' ') for sent in tgt_sentences]

            examples.append({"src": src_tokens, "tgt": tgt_tokens})

    return examples
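The regex in this loader is a crude sentence splitter: wherever a word character plus '.' is followed by a space and a word character, it inserts a newline between them (so it also fires on abbreviations such as 'e.g.'). A quick illustration:

import re

regex = r"([\w][.])+?[ ]([\w])"
to = r"\1\n\2"
text = "First sentence. Second sentence. And a third."
print(re.sub(regex, to, text).split("\n"))
# ['First sentence.', 'Second sentence.', 'And a third.']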
Example #9
def load_jsonwiki(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if (lower):
            tokens = [t.lower() for t in tokens]

        if (tokens[0] == '@summary'):
            flag = True
            tgt.append([])
            # no continue: the tail of this sentence is consumed below

        if (tokens[0] == '@article'):
            flag = False
            source.append(tokens[1:])
            continue

        if (flag):
            if (tokens[0] != '@summary'):
                tgt[-1].extend(tokens)
            else:
                tgt[-1].extend(tokens[1:])
        else:
            source.append(tokens)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
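This variant apparently expects '@summary' to open each summary block and '@article' to open the body text; that layout is inferred from the code, not documented. A minimal sketch with the identity clean() stub as a stand-in:

import json

def clean(s):  # stand-in for the clean() helper (assumption)
    return s

doc = {"sentences": [
    {"tokens": [{"word": w} for w in ["@article", "Body", "text", "."]]},
    {"tokens": [{"word": w} for w in ["@summary", "Short", "abstract", "."]]},
]}
with open("sample.wiki.json", "w", encoding="utf-8") as f:
    json.dump(doc, f)

source, tgt = load_jsonwiki("sample.wiki.json", lower=True)
print(source)  # [['body', 'text', '.']]
print(tgt)     # [['short', 'abstract', '.']]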
Example #10
def load_json(p, lower):
    """ 
    Function to load json and create dataset
    Input: text as train;  highlight as target
    """
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if (lower):
            tokens = [t.lower() for t in tokens]
        if (tokens[0] == '@highlight'):
            flag = True
            tgt.append([])
            continue
        if (flag):
            tgt[-1].extend(tokens)
        else:
            source.append(tokens)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
Example #11
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    # pos_tag = []
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        # _pos_tag = [t['pos'] for t in sent['tokens']]
        if (lower):
            tokens = [t.lower() for t in tokens]
        if (tokens[0] == '@highlight'):
            flag = True
            continue
        if (flag):
            tgt.append(tokens)
            flag = False
        else:
            source.append(tokens)
            # pos_tag.append(_pos_tag)

    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    # assert len(' '.join([' '.join(i) for i in pos_tag]).split()) == len(' '.join([' '.join(i) for i in source]).split())
    return source, tgt