Example #1
import codecs
import json
import sys


def run_model_targetid(text_lines, model_variables):
    """`text_input` is the document, which contains lines"""

    builders = model_variables['builders']
    model_dir = model_variables['model_dir']

    instances = []
    offsets = []  # one entry per line: the offsets of each token in that line
    for i, line in enumerate(text_lines):
        instance, offset = make_data_instance(line, i, get_offsets=True)
        instances.append(instance)
        offsets.append(offset)
    # filter out empty rows (offsets keeps one entry per input line)
    instances = [el for el in instances if el]

    with open('{}offsets.json'.format(model_dir), 'w') as f:
        json.dump(offsets, f)
    out_conll_file = "{}predicted-targets.conll".format(model_dir)

    predictions = []
    for instance in instances:
        _, prediction = identify_targets(builders, instance.tokens,
                                         instance.postags, instance.lemmas,
                                         model_variables)
        predictions.append(prediction)
    sys.stderr.write(
        "Printing output in CoNLL format to {}\n".format(out_conll_file))
    print_as_conll_targetid(instances, predictions, out_conll_file)
    sys.stderr.write("Done!\n")

    return offsets
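
# A minimal invocation sketch (an assumption, not part of the original
# listing): `model_variables` needs at least the 'builders' and 'model_dir'
# keys read above, and `model_dir` should end with '/'; `builders` comes from
# the surrounding project.
#
#     text_lines = ["The dog barked .", "It rained all day ."]
#     model_variables = {"builders": builders, "model_dir": "logs/"}
#     token_offsets = run_model_targetid(text_lines, model_variables)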


def load_instances(filepath):
    print("Loading Instances from {}".format(filepath))
    with codecs.open(filepath, 'r', encoding='utf-8') as infile:
        lines = infile.readlines()
        instances = [
            make_data_instance(line, i) for i, line in enumerate(lines)
        ]
    return instances
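
# Usage sketch (hypothetical path; make_data_instance is defined elsewhere in
# the project):
#
#     instances = load_instances("data/sentences.txt")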


if options.mode in ["train", "refresh"]:
    dev_examples, _, _ = read_conll(DEV_CONLL)
    combined_dev = combine_examples(dev_examples)
    out_conll_file = "{}predicted-{}-targetid-dev.conll".format(
        model_dir, VERSION)
elif options.mode == "test":
    dev_examples, m, t = read_conll(TEST_CONLL)
    combined_dev = combine_examples(dev_examples)
    out_conll_file = "{}predicted-{}-targetid-test.conll".format(
        model_dir, VERSION)
elif options.mode == "predict":
    assert options.raw_input is not None
    with open(options.raw_input, "rb") as fin:
        instances = [
            make_data_instance(line.decode(encoding='UTF-8', errors='strict'),
                               i) for i, line in enumerate(fin)
            if line.strip()  # skip blank lines
        ]
    filename = options.raw_input.split('/')[-1].split('.')[0]
    out_conll_file = "{}/lingfn_comparison/predicted-targets_{}.conll".format(
        model_dir, filename)
else:
    raise Exception("Invalid parser mode", options.mode)

# Default configurations.
configuration = {
    "train": train_conll,
    "unk_prob": 0.1,
    "dropout_rate": 0.01,
    "token_dim": 100,
    "pos_dim": 100,

Example #4
UNKTOKEN = VOCDICT.getid(UNK)  # vocabulary id for the unknown-token symbol

if options.mode in ["train", "refresh"]:
    dev_examples, _, _ = read_conll(DEV_CONLL)
    combined_dev = combine_examples(dev_examples)
    out_conll_file = "{}predicted-{}-targetid-dev.conll".format(
        model_dir, VERSION)
elif options.mode == "test":
    dev_examples, m, t = read_conll(TEST_CONLL)
    combined_dev = combine_examples(dev_examples)
    out_conll_file = "{}predicted-{}-targetid-test.conll".format(
        model_dir, VERSION)
elif options.mode == "predict":
    assert options.raw_input is not None
    with open(options.raw_input, "r") as fin:
        instances = [make_data_instance(line, i) for i, line in enumerate(fin)]
    out_conll_file = "{}predicted-targets.conll".format(model_dir)
else:
    raise Exception("Invalid parser mode", options.mode)

# Default configurations.
configuration = {
    "train": train_conll,
    "unk_prob": 0.1,
    "dropout_rate": 0.01,
    "token_dim": 100,
    "pos_dim": 100,
    "lemma_dim": 100,
    "lstm_input_dim": 100,
    "lstm_dim": 100,
    "lstm_depth": 2,

Example #5
import nltk


def load_instances(story):
    lines = nltk.sent_tokenize(story)
    instances = [make_data_instance(line, i) for i, line in enumerate(lines)]
    return instances
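
# Usage sketch (assumes NLTK's "punkt" sentence-tokenizer data is available,
# e.g. after nltk.download('punkt'), and make_data_instance is defined
# elsewhere in the project):
#
#     story = "The dog barked. It rained all day."
#     instances = load_instances(story)  # one instance per sentence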