import json

# Dataset and Example are assumed to be SQLAlchemy-style models imported from
# the app's models module; db is the database session handle.
def init_database(db, input_path, exist_annotation, db_id, batch_size):
    with open(input_path, 'r') as infile:
        json_obj = json.load(infile)

    db_name = "WebNLG_" + str(db_id)
    dataset = Dataset(name=db_name)
    db.session.add(dataset)
    db.session.commit()

    example_id = 300
    for obj in json_obj:
        for i in range(3):
            # Skip targets that already have an annotation
            if obj['ID'] in exist_annotation \
                    and i in exist_annotation[obj['ID']]:
                continue
            # Start a fresh dataset every batch_size examples; the > 300
            # guard avoids rolling over before the first example is written
            if example_id > 300 and example_id % batch_size == 0:
                db_id += 1
                db_name = "WebNLG_" + str(db_id)
                dataset = Dataset(name=db_name)
                db.session.add(dataset)
                db.session.commit()
            example = Example(id=example_id,
                              dataset_id=dataset.id,
                              ex_id=obj['ID'],
                              tgt_id=i,
                              src_json=json.dumps(obj['SRC']),
                              tgt_json=json.dumps(obj['TGT-' + str(i)]),
                              sanity_check=json.dumps(obj['CHK-' + str(i)]))
            db.session.add(example)
            db.session.commit()
            example_id += 1
    return example_id
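
# A minimal usage sketch for the batched loader above; the input path, the
# batch size, and the exact shape of exist_annotation are illustrative
# assumptions, not part of the snippet:
#
#   exist_annotation = {'Id42': {0, 2}}  # ex_id -> set of annotated tgt_ids
#   next_id = init_database(db, 'webnlg_input.json', exist_annotation,
#                           db_id=1, batch_size=50)
#   # next_id can seed a later call so example IDs stay contiguous.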

import pytest

# Dataset and ValidationError are assumed imported from the app's models module
def test_name_too_short():
    """ Ensure that dataset names with length < 2 lead to validation error """
    min_length = 2
    Dataset(name="a" * min_length)
    for i in range(min_length):
        with pytest.raises(ValidationError):
            Dataset(name="a" * i)
Example #3
import json
import os

# dataset_path must point at a folder containing 'summaries' and 'documents';
# example_filter is assumed to be defined in the surrounding module.
def one_split(db, idx, sanity_data, dataset_path):
    # Insert dataset
    dataset = Dataset(name="ALG_FACT" + str(idx))
    db.session.add(dataset)
    db.session.commit()

    summaries_path = os.path.join(dataset_path, 'summaries')
    documents_path = os.path.join(dataset_path, 'documents')
    for doc_id in sanity_data:
        file_name = doc_id + ".data"
        file_path = os.path.join(documents_path, file_name)
        summ_path = os.path.join(summaries_path, file_name)
        with open(summ_path, 'r') as infile:
            summ_json = json.load(infile)
        with open(file_path, 'r') as infile:
            json_result = json.load(infile)
            did = json_result['doc_id']
            for i, item in enumerate(summ_json):
                # Skip summaries whose name lacks the "|||" marker or whose
                # text is rejected by example_filter
                if "|||" not in item['name']:
                    continue
                if example_filter(item['text']):
                    continue
                document = Document(
                    dataset_id=dataset.id,
                    doc_id=json_result['doc_id'],
                    doc_json=json.dumps(json_result),
                    summary=json.dumps(item),
                    sanity_statement=sanity_data[did]["sanity_statement"],
                    sanity_answer=sanity_data[did]["sanity_answer"]
                )
                db.session.add(document)
                db.session.commit()
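
# Hedged usage sketch for one_split: sanity_data maps each doc_id to a
# statement and a boolean answer, the same shape the BBC_pair loader below
# builds from sanity.txt. The paths are hypothetical:
#
#   sanity_data = {}
#   with open('sanity_id/sanity.txt', 'r') as infile:
#       for line in infile:
#           doc_id, statement, answer = line.strip().split('\t')
#           sanity_data[doc_id] = {'sanity_statement': statement,
#                                  'sanity_answer': bool(int(answer))}
#   one_split(db, 0, sanity_data, dataset_path='../backend/ALG_FACT')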

import json
import os

def init_database(db):
    # user = User(email='admin@localhost', password='******')
    # db.session.add(user)
    # db.session.commit()
    dataset_path = '../backend/BBC'
    dataset_name = os.path.split(dataset_path)[1]

    summaries_path = os.path.join(dataset_path, 'summaries')
    documents_path = os.path.join(dataset_path, 'documents')

    # Existing dataset
    #dataset = db.session.query(Dataset).filter_by(name='BBC').first()
    # Insert dataset
    dataset = Dataset(name="BBC_test")
    db.session.add(dataset)
    db.session.commit()

    # Insert documents
    for file in os.listdir(documents_path):
        file_path = os.path.join(documents_path, file)
        with open(file_path, 'r') as infile:
            json_result = json.load(infile)
            document = Document(dataset_id=dataset.id,
                                doc_id=json_result['doc_id'],
                                doc_json=json.dumps(json_result),
                                summary="aaaaaaa")  # placeholder summary text
            db.session.add(document)
            db.session.commit()

    # Insert Summaries
    for folder in os.listdir(summaries_path):
        if folder.startswith('ref'):
            summary_group = SummaryGroup(name='%s_ref_%s' %
                                         (dataset_name, folder[4:]),
                                         dataset_id=dataset.id,
                                         is_ref=True)
        elif folder.startswith('system'):
            summary_group = SummaryGroup(name='%s_system_%s' %
                                         (dataset_name, folder[7:]),
                                         dataset_id=dataset.id,
                                         is_ref=False)
        else:
            continue  # ignore folders that are neither ref_* nor system_*
        db.session.add(summary_group)
        db.session.commit()
        ref_path = os.path.join(summaries_path, folder)
        for file in os.listdir(ref_path):
            with open(os.path.join(ref_path, file), 'r') as infile:
                text = ' '.join(infile.readlines()).strip()
                # Filter by dataset_id as well, so doc_ids from other
                # datasets cannot collide
                document = db.session.query(Document).filter_by(
                    dataset_id=dataset.id,
                    doc_id=os.path.splitext(file)[0]).first()
                summary = Summary(doc_id=document.id,
                                  text=text,
                                  summary_group_id=summary_group.id)
                db.session.add(summary)
                db.session.commit()
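
# Directory layout the loader above expects, inferred from the code; file
# extensions and labels are illustrative:
#
#   BBC/
#     documents/<doc_id>.json        # one JSON object with a 'doc_id' field
#     summaries/
#       ref_<label>/<doc_id>.txt     # reference summaries, one per document
#       system_<label>/<doc_id>.txt  # system summaries, same file naming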

import json
import os

def init_database(db):
    # user = User(email='admin@localhost', password='******')
    # db.session.add(user)
    # db.session.commit()
    dataset_path = '../backend/BBC_pair'
    dataset_name = os.path.split(dataset_path)[1]

    summaries_path = os.path.join(dataset_path, 'summaries')
    documents_path = os.path.join(dataset_path, 'documents')
    sanity_path = os.path.join(dataset_path, 'sanity_id/sanity.txt')

    # Existing dataset
    #dataset = db.session.query(Dataset).filter_by(name='BBC').first()
    # Insert dataset
    dataset = Dataset(name="BBC")
    db.session.add(dataset)
    db.session.commit()

    # Read sanity checks: one "doc_id<TAB>statement<TAB>answer" row per line
    sanity_data = {}
    with open(sanity_path, 'r') as infile:
        for line in infile:
            flist = line.strip().split("\t")
            sanity_data[flist[0]] = {
                "sanity_answer": bool(int(flist[2])),
                "sanity_statement": flist[1]
            }

    # Insert documents
    for file in os.listdir(documents_path):
        file_path = os.path.join(documents_path, file)
        summ_path = os.path.join(summaries_path, file)
        with open(summ_path, 'r') as infile:
            summ_json = json.load(infile)
        with open(file_path, 'r') as infile:
            json_result = json.load(infile)
            did = json_result['doc_id']
            for i, item in enumerate(summ_json):
                document = Document(
                    dataset_id=dataset.id,
                    doc_id=json_result['doc_id'],
                    doc_json=json.dumps(json_result),
                    summary=json.dumps(item),
                    sanity_statement=sanity_data[did]["sanity_statement"],
                    sanity_answer=sanity_data[did]["sanity_answer"])
                db.session.add(document)
                db.session.commit()
Example #6
import json

def init_database(db, input_path, db_id, example_id):
    # Insert dataset
    db_name = "WebNLG_" + str(db_id)
    dataset = Dataset(name=db_name)
    db.session.add(dataset)
    db.session.commit()

    with open(input_path, 'r') as infile:
        json_obj = json.load(infile)

    for obj in json_obj:
        for i in range(3):  # each object carries three targets, TGT-0..TGT-2
            example_id += 1
            example = Example(id=example_id,
                              dataset_id=dataset.id,
                              ex_id=obj['ID'],
                              tgt_id=i,
                              src_json=json.dumps(obj['SRC']),
                              tgt_json=json.dumps(obj['TGT-' + str(i)]),
                              sanity_check=json.dumps(obj['CHK-' + str(i)]))
            db.session.add(example)
            db.session.commit()
    return example_id
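
# Sketch of chaining calls so example IDs stay unique across several input
# files (file names are hypothetical):
#
#   example_id = 0
#   for db_id, path in enumerate(['split_0.json', 'split_1.json'], start=1):
#       example_id = init_database(db, path, db_id, example_id)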
Example #7
import json
import os

def init_database(db):
    user = User(email='admin@localhost', password='******')
    db.session.add(user)
    db.session.commit()
    dataset_path = '/home/acp16hh/Projects/Research/Experiments/Exp_Elly_Human_Evaluation/src/Mock_Dataset_2/BBC_Sample'
    # dataset_path = '/home/acp16hh/Projects/Research/Experiments/Exp_Elly_Human_Evaluation/src/Mock_Dataset_2/BBC'
    dataset_name = os.path.split(dataset_path)[1]

    summaries_path = os.path.join(dataset_path, 'summaries')
    documents_path = os.path.join(dataset_path, 'documents')

    # Existing dataset
    # dataset = db.session.query(Dataset).filter_by(name='BBC').first()
    # Insert dataset
    dataset = Dataset(name=dataset_name)
    db.session.add(dataset)
    db.session.commit()

    # Insert documents
    for file in os.listdir(documents_path):
        file_path = os.path.join(documents_path, file)
        with open(file_path, 'r') as infile:
            json_result = json.load(infile)
            document = Document(
                dataset_id=dataset.id,
                doc_id=json_result['doc_id'],
                doc_json=json.dumps(json_result)
            )
            db.session.add(document)
            db.session.commit()

    # Insert Summaries
    for folder in os.listdir(summaries_path):
        if folder.startswith('ref'):
            summary_group = SummaryGroup(name='%s_ref_%s' % (dataset_name, folder[4:]),
                                         dataset_id=dataset.id, is_ref=True)
        elif folder.startswith('system'):
            summary_group = SummaryGroup(name='%s_system_%s' % (dataset_name, folder[7:]),
                                         dataset_id=dataset.id, is_ref=False)
        else:
            continue  # ignore folders that are neither ref_* nor system_*
        db.session.add(summary_group)
        db.session.commit()
        ref_path = os.path.join(summaries_path, folder)
        for file in os.listdir(ref_path):
            with open(os.path.join(ref_path, file), 'r') as infile:
                text = ' '.join(infile.readlines()).strip()
                document = db.session.query(Document).filter_by(
                    dataset_id=dataset.id,  # avoid doc_id collisions across datasets
                    doc_id=os.path.splitext(file)[0]).first()
                summary = Summary(
                    doc_id=document.id,
                    text=text,
                    summary_group_id=summary_group.id
                )
                db.session.add(summary)
                db.session.commit()

    # Insert pairs: match each system summary with the reference summary
    # for the same document
    ref_summary_groups = db.session.query(SummaryGroup)\
        .filter_by(dataset_id=dataset.id, is_ref=True).all()
    system_summary_groups = db.session.query(SummaryGroup)\
        .filter_by(dataset_id=dataset.id, is_ref=False).all()

    for ref_summ_group in ref_summary_groups:
        for system_summ_group in system_summary_groups:
            for system_summary in system_summ_group.summaries:
                ref_summary = db.session.query(Summary)\
                    .filter_by(summary_group_id=ref_summ_group.id, doc_id=system_summary.doc_id).first()
                summaries_pair = SummariesPair(
                    ref_summary_id=ref_summary.id,
                    system_summary_id=system_summary.id,
                    dataset_id=dataset.id
                )
                db.session.add(summaries_pair)
                db.session.commit()

import pytest

# Dataset, DatasetVisibility and ValidationError are assumed to be imported
# from the application's models module
def test_empty_description():
    """ Ensure that an empty description is allowed """
    Dataset(name="demo", description="")
def test_description_too_long():
    """ Ensure that dataset descriptions with length > 256 lead to validation error """
    max_length = 256
    Dataset(name="demo", description="a" * max_length)
    with pytest.raises(ValidationError):
        Dataset(name="demo", description="a" * (max_length + 1))
def test_name_too_long():
    """ Ensure that dataset names with length > 64 lead to validation error """
    max_length = 64
    Dataset(name="a" * max_length)
    with pytest.raises(ValidationError):
        Dataset(name="a" * (max_length + 1))
def test_name_missing_error():
    """ Ensure that a missing dataset name leads to validation error """
    with pytest.raises(ValidationError):
        Dataset(visibility="public")
def test_visibility_validation_error():
    """ Ensure that an unknown visibility value leads to validation error """
    with pytest.raises(ValidationError):
        Dataset(name="demo", visibility="something_else")
def test_public_visibility():
    """ Ensure that "public" maps to DatasetVisibility.PUBLIC """
    dataset = Dataset(name="demo", visibility="public")
    assert dataset.visibility == DatasetVisibility.PUBLIC
    assert dataset.visibility == "public"
def test_private_visibility():
    """ Ensure that "private" maps to DatasetVisibility.PRIVATE """
    dataset = Dataset(name="demo", visibility="private")
    assert dataset.visibility == DatasetVisibility.PRIVATE
    assert dataset.visibility == "private"
def test_default():
    """ Ensure the defaults: public visibility and no description """
    dataset = Dataset(name="demo")
    assert dataset.visibility == DatasetVisibility.PUBLIC
    assert dataset.visibility == "public"
    assert dataset.description is None