Example #1
def test_encoding():
    original = 'SELECT ?city WHERE { ?m skos:broader dbc:Cities_in_Germany . ?city dct:subject ?m . ?city dbo:areaTotal ?area . ?b dbo:artist dbr:John_Halsey_(musician) } order by asc (?area)'
    expected_encoding = 'SELECT var_city WHERE brack_open var_m skos_broader dbc_Cities_in_Germany sep_dot var_city dct_subject var_m sep_dot var_city dbo_areaTotal var_area sep_dot var_b dbo_artist dbr_John_Halsey_ attr_open musician attr_close  brack_close _oba_ var_area '

    result = generator_utils.encode(original)

    assert result == expected_encoding
    assert str.strip(generator_utils.decode(result)) == original
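The generator_utils implementation itself is not shown in these examples. As a rough illustration only, the reversible substitution that the test above exercises could be sketched with an ordered replacement table like the following (the table entries and helper names are assumptions, not the project's actual code):

# Hypothetical sketch of a reversible SPARQL <-> token-sequence mapping.
# The real generator_utils uses its own, much larger replacement table.
REPLACEMENTS = [
    ('{', ' brack_open '),
    ('}', ' brack_close '),
    (' . ', ' sep_dot '),
    ('?', 'var_'),
]

def encode_sketch(sparql):
    for original, token in REPLACEMENTS:
        sparql = sparql.replace(original, token)
    return ' '.join(sparql.split()) + ' '

def decode_sketch(encoded):
    # Apply the table in reverse order to undo the substitutions.
    for original, token in reversed(REPLACEMENTS):
        encoded = encoded.replace(token.strip(), original)
    return ' '.join(encoded.split())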
Example #2
def build_dataset_pair(binding, template):
    # Fill the template's question and query with the label/URI bound to each
    # placeholder variable, then encode the resulting SPARQL.
    english = getattr(template, 'question')
    sparql = getattr(template, 'query')
    for variable in binding:
        uri = binding[variable]['uri']
        label = binding[variable]['label']
        placeholder = '<{}>'.format(str.upper(variable))
        if placeholder in english and label is not None:
            english = english.replace(placeholder, strip_brackets(label))
        if placeholder in sparql and uri is not None:
            sparql = sparql.replace(placeholder, uri)

    sparql = encode(sparql)
    dataset_pair = {'english': english, 'sparql': sparql}
    return dataset_pair
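A hypothetical invocation of build_dataset_pair (the Template namedtuple and binding layout below are illustrative assumptions, not the project's actual classes) could look like this:

# Illustrative usage sketch: the template exposes 'question'/'query' attributes
# and the binding maps a placeholder name to a {'uri', 'label'} dict.
import collections

Template = collections.namedtuple('Template', ['question', 'query'])
template = Template(question='who is the mayor of <A>?',
                    query='SELECT ?x WHERE { <A> dbo:mayor ?x }')
binding = {'a': {'uri': 'dbr:Berlin', 'label': 'Berlin'}}

pair = build_dataset_pair(binding, template)
# pair['english'] -> 'who is the mayor of Berlin?'
# pair['sparql']  -> encoded form of 'SELECT ?x WHERE { dbr:Berlin dbo:mayor ?x }'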
Example #3
    # Each tab-separated line holds (question, SPARQL query, entity list).
    with open(fp, "r", encoding="UTF-8") as text:
        for i, line in enumerate(text):
            three = line.split("\t")
            query = three[1]
            if not (query.startswith("ASK") or query.startswith("ask")):
                simile = get_simile(query)
                if simile is not None:
                    lis = ast.literal_eval(three[-1])
                    # Keep the query only if at least one bound entity is a URI.
                    if any(ent.startswith("http:") for ent in lis):
                        queriset.append((simile, i))
                        for ent in lis:
                            if ent.startswith("http:"):
                                # Encode the entity URI and embed it with GloVe.
                                ent = preprocess_sentence(
                                    generator_utils.encode(ent))
                                vec = glove_embedding(ent)
                                dataset.append((vec, i))
    #print("dataset length: ", len(dataset))
    #print("queriset length: ", len(queriset))
    def calcul_rank(queriset, dataset):
        # For each query, rank the embedded entities by similarity and collect
        # MRR, Hits@10 and Hits@100 statistics.
        mrrs = []
        hits10 = []
        hits100 = []
        for simile, i in queriset:
            #sorted_dataset = sorted(dataset, key=lambda pair:cosine(simile, pair[0]))
            sorted_dataset = sort_data(simile, dataset)

            def add(num):
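The fragment is cut off inside calcul_rank. As a generic illustration of how the collected lists are typically filled (a sketch of the standard metric computation, not the truncated original code), given the 1-based rank of the correct entry in the similarity-sorted list:

# Generic sketch: accumulate reciprocal rank and Hits@k for one query.
def update_metrics(rank, mrrs, hits10, hits100):
    mrrs.append(1.0 / rank)
    hits10.append(1 if rank <= 10 else 0)
    hits100.append(1 if rank <= 100 else 0)

# After all queries: MRR = sum(mrrs) / len(mrrs), Hits@10 = sum(hits10) / len(hits10), etc.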
Example #4
    used_resources_root, _ = os.path.splitext(used_resources_file)
    filtered_sparql_file = '{}_filtered_{:d}_{}.sparql'.format(
        dataset_root, MINIMUM, COMP.__name__)
    filtered_en_file = '{}_filtered_{:d}_{}.en'.format(dataset_root, MINIMUM,
                                                       COMP.__name__)

    # Keep only resources that occur at least MINIMUM times in the dataset.
    used_resources = collections.Counter(
        json.loads(open(used_resources_file).read()))
    filtered_resources = [
        elem_cnt for elem_cnt in list(used_resources.items())
        if elem_cnt[1] >= MINIMUM
    ]
    save_cache('{}_filter_{:d}.json'.format(used_resources_root, MINIMUM),
               collections.Counter(dict(filtered_resources)))
    valid_encoded_resources = [
        encode(elem_cnt1[0]) for elem_cnt1 in filtered_resources
    ]
    # A query line is kept when COMP (any/all) of its encoded entities pass the check.
    check = lambda encoded_entity: encoded_entity in valid_encoded_resources

    valid_lines = []
    filtered_queries = []
    with open(dataset_root + '.sparql', 'r') as sparql_file:
        for linenumber, line in enumerate(sparql_file):
            entities = extract_encoded_entities(line)
            valid = COMP(list(map(check, entities)))
            if valid:
                filtered_queries.append(line)
                valid_lines.append(linenumber)

    filtered_questions = []
    with open(dataset_root + '.en', 'r') as en_file:
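The helper extract_encoded_entities is called but not shown in these examples. Judging from the encoding in Example #1, one plausible (purely illustrative) reading is a regex that picks the encoded DBpedia-style tokens out of a line:

import re

# Assumed helper, for illustration only: find encoded entity tokens such as
# dbr_..., dbo_..., dbc_... or dct_... in an encoded SPARQL line.
ENTITY_PATTERN = re.compile(r'\b(?:dbr|dbo|dbp|dbc|dct)_\S+')

def extract_encoded_entities_sketch(encoded_line):
    return ENTITY_PATTERN.findall(encoded_line)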
Example #5
    sys.setdefaultencoding("utf-8")

    dataset_root, _ = os.path.splitext(dataset_file)
    used_resources_root, _ = os.path.splitext(used_resources_file)
    filtered_sparql_file = '{}_filtered_{:d}_{}.sparql'.format(
        dataset_root, MINIMUM, COMP.__name__)
    filtered_en_file = '{}_filtered_{:d}_{}.en'.format(dataset_root, MINIMUM,
                                                       COMP.__name__)

    used_resources = collections.Counter(
        json.loads(open(used_resources_file).read()))
    filtered_resources = filter(lambda (elem, cnt): cnt >= MINIMUM,
                                used_resources.items())
    save_cache('{}_filter_{:d}.json'.format(used_resources_root, MINIMUM),
               collections.Counter(dict(filtered_resources)))
    valid_encoded_resources = map(lambda (elem, cnt): encode(elem),
                                  filtered_resources)
    check = lambda encoded_entity: encoded_entity in valid_encoded_resources

    valid_lines = []
    filtered_queries = []
    with open(dataset_root + '.sparql', 'r') as sparql_file:
        for linenumber, line in enumerate(sparql_file):
            entities = extract_encoded_entities(line)
            valid = COMP(map(check, entities))
            if valid:
                filtered_queries.append(line)
                valid_lines.append(linenumber)

    filtered_questions = []
    with open(dataset_root + '.en', 'r') as en_file:
Example #6
    MINIMUM = int(args.minimum)
    COMP = any if args.comp == 'any' else all



    dataset_root, _ = os.path.splitext(dataset_file)
    used_resources_root, _ = os.path.splitext(used_resources_file)
    filtered_sparql_file = '{}_filtered_{:d}_{}.sparql'.format(dataset_root, MINIMUM, COMP.__name__)
    filtered_en_file = '{}_filtered_{:d}_{}.en'.format(dataset_root, MINIMUM, COMP.__name__)

    used_resources = collections.Counter(json.loads(open(used_resources_file).read()))
    filtered_resources = [elem_cnt for elem_cnt in list(used_resources.items()) if elem_cnt[1] >= MINIMUM]
    save_cache('{}_filter_{:d}.json'.format(used_resources_root, MINIMUM), collections.Counter(dict(filtered_resources)))
    valid_encoded_resources = [encode(elem_cnt1[0]) for elem_cnt1 in filtered_resources]
    check = lambda encoded_entity : encoded_entity in valid_encoded_resources

    valid_lines = []
    filtered_queries = []
    with open(dataset_root+'.sparql', 'r') as sparql_file:
        for linenumber, line in enumerate(sparql_file):
            entities = extract_encoded_entities(line)
            valid = COMP(list(map(check, entities)))
            if valid:
                filtered_queries.append(line)
                valid_lines.append(linenumber)

    filtered_questions = []
    with open(dataset_root+'.en', 'r') as en_file:
        for linenumber, line in enumerate(en_file):
Example #7
import argparse
from generator_utils import decode, encode

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('mode',
                        nargs='?',
                        choices=['encode', 'decode'],
                        default='decode')
    parser.add_argument('input_path')
    args = parser.parse_args()

    with open(args.input_path, 'r') as input_file:
        for line in input_file:
            if args.mode == 'decode':
                print(decode(line.strip()))
            elif args.mode == 'encode':
                print(encode(line.strip()))
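Assuming the script above is saved as, say, convert.py (the file names here are placeholders), it reads one query per line and prints the converted form: python convert.py encode queries.sparql encodes every line, while python convert.py encoded_queries.sparql decodes them, since 'decode' is the default mode.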