# Common imports used by the example functions below; project helpers such as
# load_json, acquire_patterns, and compare_all are assumed to come from the surrounding project.
import os
import re
import json
import codecs
import logging
from collections import Counter

import nltk

logger = logging.getLogger(__name__)


def main():
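    # Compare the manually tagged items with the automatically tagged items and report the result counts.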
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data_manual= path_project+os.sep+"input"+os.sep+"tagged_items.json"
    path_data_auto= path_project+os.sep+"input"+os.sep+"items_tagged_auto.json"
    data_manual=load_json(path_data_manual)
    data_auto=load_json(path_data_auto)
    def2result=compare_all(data_manual,data_auto)
    count_result(def2result,data_manual,data_auto)
def main():
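    # Run the automatic pattern-based extraction, then evaluate it against the manually tagged data.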
#     IE()
    IE_auto_pattern()
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data_manual= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json"
    path_data_auto= path_project+os.sep+"output"+os.sep+"items_tagged_auto.json"
    data_manual=load_json(path_data_manual)
    data_auto=load_json(path_data_auto)
    def2result=compare_all(data_manual,data_auto)
    count_result(def2result,data_manual,data_auto)
    compare_all_different(data_manual,data_auto)
def main():
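    # Acquire pattern candidates from the tagged data, filter and merge them, rank by priority, and write the results.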
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = "items_tagged_modified.json"
    #     path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified_test.json"
    path_pattern = path_project + os.sep + "output" + os.sep + "Patterns_auto.json"
    path_pattern_sorted = path_project + os.sep + "output" + os.sep + "pattern_auto_sorted_by_attribute_pattern.json"
    path_pattern = "Patterns_auto.json"
    path_pattern_sorted = "pattern_auto_sorted_by_attribute_pattern.json"

    #     path_pattern_sorted_by_attribute=path_project+os.sep+"output"+os.sep+"pattern_auto_sorted_by_attribute.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    #acquire all the pattern candidates
    attribute2PSFSList = acquire_patterns(data)
    #     getVBContext(attribute2PSFSList)
    #filter the pattern candidates
    PASFFLSsList = filterPatterns(attribute2PSFSList)
    #merge the pattern candidates
    patterns = merge_pattern_all(PASFFLSsList)

    patterns_priority = calculate_proprity2pattern(patterns)
    logger.info("has acquired all the patterns")
    json.dump(patterns_priority,
              codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
    #     json.dump(dict_reverse(patterns), codecs.open(path_pattern_reverse, 'w','utf-8'),ensure_ascii=False,indent=2)
    #     dict_sorted_value(dict_reverse(patterns),path_pattern_reverse_sorted)
    sorted_by_attribute_pattern(patterns_priority, path_pattern_sorted)
    logger.info("output over")
def main():
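    # Acquire pattern candidates, merge them, rank by priority, and write the results (no filtering step).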
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json"
    #     path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified_test.json"
    path_pattern = path_project + os.sep + "output" + os.sep + "Patterns_auto.json"
    path_pattern_sorted = path_project + os.sep + "output" + os.sep + "pattern_auto_sorted_by_attribute_pattern.json"
    #     path_pattern_sorted_by_attribute=path_project+os.sep+"output"+os.sep+"pattern_auto_sorted_by_attribute.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    patternTfix_tokensTattribute_tuples = acquire_patterns(data)
    logger.info('patternTfix_tokensTattribute_tuples: %d\n' %
                len(patternTfix_tokensTattribute_tuples) +
                str(patternTfix_tokensTattribute_tuples))
    patterns = merge_pattern_all(patternTfix_tokensTattribute_tuples)

    patterns_priority = calculate_proprity2pattern(patterns)
    logger.info("has acquired all the patterns")
    json.dump(patterns_priority,
              codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
    #     json.dump(dict_reverse(patterns), codecs.open(path_pattern_reverse, 'w','utf-8'),ensure_ascii=False,indent=2)
    #     dict_sorted_value(dict_reverse(patterns),path_pattern_reverse_sorted)
    sorted_by_attribute_pattern(patterns_priority, path_pattern_sorted)
    logger.info("output over")
def test2():
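    # Recompute priorities for the manually curated patterns and write a sorted copy.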
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_pattern= path_project+os.sep+"input"+os.sep+"patterns.json"
#     path_tagged_output= path_project+os.sep+"output"+os.sep+"patterns_priority.json"
    path_pattern_new= path_project+os.sep+"output"+os.sep+"patterns_priority_sorted.json"
    data=load_json(path_pattern)
    data_new=calculate_proprity2pattern(data)
#     json.dump(data_new, codecs.open(path_tagged_output, 'w','utf-8'),ensure_ascii=False,indent=2)
    sorted_pattern(data_new,path_pattern_new)
def test():
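    # Collect the special characters found in the items and write them to a text file.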
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data= path_project+os.sep+"input"+os.sep+"items.json"
    path_output=path_project+os.sep+"output"+os.sep+"special_chars.txt"
    data=load_json(path_data)
    special_chars=find_special_char(data)
    fp=codecs.open(path_output, 'w','utf-8')
    for i in special_chars:
        fp.write(i+"\n")
    fp.close()
    print special_chars
def test1():
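    # Acquire patterns from a small test set and dump them without post-processing.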
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified_test.json"
#     path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified_test.json"
    path_pattern= path_project+os.sep+"output"+os.sep+"patterns_auto_test.json"
    data=load_json(path_data)
    logger.info("loaded all the data")
    patterns=acquire_patterns(data)
    logger.info("has acquired all the patterns")
    json.dump(patterns, codecs.open(path_pattern, 'w','utf-8'),ensure_ascii=False,indent=2)
    logger.info("output over")
def main():
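    # Extract attributes from the tagged items using the manually curated patterns and write the result.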
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data= path_project+os.sep+"input"+os.sep+"tagged_items.json"
    path_pattern= path_project+os.sep+"input"+os.sep+"Patterns.json"
#     path_data= path_project+os.sep+"input"+os.sep+"items.json"
#     path_data_output=path_project+os.sep+"output"+os.sep+"test_items.json"
    path_tagged_output=path_project+os.sep+"output"+os.sep+"test_items_tagged.txt"
    pattern2attribute=get_all_pattern(load_patterns(path_pattern))
    data=load_json(path_data)
    data_new=extract_all_items(data,pattern2attribute.keys())
    json.dump(data_new, codecs.open(path_tagged_output, 'w','utf-8'),ensure_ascii=False,indent=2)
def test7():
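    # Chunk noun phrases in each definition with an NLTK regexp parser and print the parse trees.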
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json"
    data=load_json(path_data)
    attributes=set([])
    for item in data:
        concept,pronunciation,pos2definition=extract_item_properties(item)
        for pos2def in pos2definition:
            definition=pos2def['definition']
            grammar = "NP: {<DT>?<JJ>*<NN>}"
            tokens=nltk.word_tokenize(definition)
            tagged=nltk.pos_tag(tokens)
            print nltk.RegexpParser(grammar).parse(tagged)
def test6():
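    # Collect and print the sorted set of attribute names used in the tagged data.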
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified.json"
    data=load_json(path_data)
    attributes=set([])
    for item in data:
        concept,pronunciation,pos2definition=extract_item_properties(item)
        for pos2def in pos2definition:
            for attribute in pos2def['attributes'].keys():
                attributes.add(attribute)
                
    for x in sorted(list(attributes)):
        print x
def test4():
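    # Extract attributes from the modified items using the loaded patterns and write the tagged output.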
    path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data= path_project+os.sep+"input"+os.sep+"items_modified.json"
    path_pattern= path_project+os.sep+"input"+os.sep+"patterns.json"
#     path_data= path_project+os.sep+"input"+os.sep+"items.json"
#     path_data_output=path_project+os.sep+"output"+os.sep+"test_items.json"
    path_tagged_output=path_project+os.sep+"output"+os.sep+"items_modified_auto.json"
    pattern2attribute=load_patterns(path_pattern)
    logger.info("loaded all the patterns")
    data=load_json(path_data)
    logger.info("loaded all the data")
    data_new=extract_items_all(data,pattern2attribute)
#     data_new=extractor_multi_thread(data,pattern2attribute,stanford_tagger)
    logger.info("has extracted all the attributes")
    json.dump(data_new, codecs.open(path_tagged_output, 'w','utf-8'),ensure_ascii=False,indent=2)
    logger.info("output over")  
def main():
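    # Clean up the raw items with modify_data and write the modified copy.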
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items.json"
    #     path_data= path_project+os.sep+"input"+os.sep+"items_test.json"
    path_new_data = path_project + os.sep + "output" + os.sep + "items_modified.json"
    #     path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified1.json"
    #     path_data= path_project+os.sep+"input"+os.sep+"items_tagged.json"
    #     path_new_data=path_project+os.sep+"output"+os.sep+"items_tagged_modified.json"
    #     path_data= path_project+os.sep+"input"+os.sep+"items_test2.json"
    #     path_new_data=path_project+os.sep+"output"+os.sep+"items_modified_test.json"
    data = load_json(path_data)
    #     analysis_data(data)
    data_new = modify_data(data)
    json.dump(data_new,
              codecs.open(path_new_data, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
def test5():
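    # Count how many whitespace-separated tokens each attribute value has and print the distribution.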
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json"

    #     path_pattern= path_project+os.sep+"output"+os.sep+"Patterns_auto.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    values = []
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            for value in pos2def["attributes"].values():
                values.append(len(value.split(" ")))
    c_value_len = Counter(values)
    print sorted(c_value_len.iteritems(),
                 key=lambda asd: asd[0],
                 reverse=False)
    logger.info("output over")
def test8():
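    # Rewrite the pattern keys with tranfer_pattern and dump the patterns sorted by priority.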
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "output" + os.sep + "patterns_priority.json"
    path_data_new = path_project + os.sep + "output" + os.sep + "patterns_priority_new.json"
    data = load_json(path_data)
    data_new = {}
    print data
    for pattern, pattern_name in data.iteritems():
        #         print item
        pattern_new = tranfer_pattern(pattern)
        data_new[pattern_new] = pattern_name
    # dump the transferred patterns (data_new), sorted by priority
    sorted_data = sorted(data_new.iteritems(),
                         key=lambda asd: asd[1],
                         reverse=True)
    json.dump(sorted_data,
              codecs.open(path_data_new, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
def test7():
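    # Strip parenthesized text from every attribute value and write the cleaned items.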
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json"
    path_tagged_output = path_project + os.sep + "output" + os.sep + "items_tagged_modified_no_bracket.json"
    #     path_pattern= path_project+os.sep+"output"+os.sep+"Patterns_auto.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    data_new = []
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            for attribute, value in pos2def["attributes"].iteritems():
                pos2def["attributes"][attribute] = re.sub(
                    r'\([\s\S]*?\)', "", value)
        data_new.append(item)
    json.dump(data_new,
              codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
    logger.info("output over")
def test2():
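    # Compare the POS tags from get_tagger() with NLTK's pos_tag and print the agreement ratio.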
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "tagged_items.json"
    data = load_json(path_data)
    tagger = get_tagger()
    cnt_same_pos_all = 0
    cnt_same_word_all = 0
    for item in data:
        concept, pronunciation, pos2definition = extract_item_properties(item)
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            text = nltk.word_tokenize(definition)
            def_pos1 = tagger.tag(text)
            logger.info(def_pos1)
            def_pos2 = nltk.pos_tag(text)
            logger.info(def_pos2)
            similar, cnt_same_pos, cnt_same_word = compare_similar_pos(
                def_pos1, def_pos2)
            cnt_same_pos_all += cnt_same_pos
            cnt_same_word_all += cnt_same_word
    print float(cnt_same_pos_all) / cnt_same_word_all
def main():
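    # Acquire patterns from the tagged data and dump them along with a reversed and sorted copy.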
    path_project = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir))
    path_data = path_project + os.sep + "input" + os.sep + "items_tagged_modified.json"
    #     path_data= path_project+os.sep+"input"+os.sep+"items_tagged_modified_test.json"
    path_pattern = path_project + os.sep + "output" + os.sep + "Patterns_auto.json"
    path_pattern_reverse = path_project + os.sep + "output" + os.sep + "Patterns_auto_reverse.json"
    path_pattern_reverse_sorted = path_project + os.sep + "output" + os.sep + "Patterns_auto_reverse_sorted.json"
    data = load_json(path_data)
    logger.info("loaded all the data")
    patterns = acquire_patterns(data)
    logger.info("has acquired all the patterns")
    json.dump(patterns,
              codecs.open(path_pattern, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
    json.dump(dict_reverse(patterns),
              codecs.open(path_pattern_reverse, 'w', 'utf-8'),
              ensure_ascii=False,
              indent=2)
    dict_sorted_value(dict_reverse(patterns), path_pattern_reverse_sorted)
    logger.info("output over")
def load_patterns(path):
    return load_json(path)