Example #1
import codecs
import json
import os

# get_cate_for_entity_list is defined elsewhere in the source module.
def get_cate_info(pure_entities, cate_info_file):
    """Return category info for the given entities, caching it as JSON."""
    if os.path.exists(cate_info_file):
        # Load the cached entity -> categories mapping.
        with open(cate_info_file) as f:
            cate_info = json.load(f)
        # Look up only the entities that are missing from the cache.
        new_cate = [entity for entity in pure_entities
                    if entity not in cate_info]
        if new_cate:
            cate_info.update(get_cate_for_entity_list(new_cate))
    else:
        cate_info = get_cate_for_entity_list(list(pure_entities))
    # Rewrite the cache file so it reflects the latest lookups.
    with codecs.open(cate_info_file, 'w', 'utf-8') as f:
        f.write(json.dumps(cate_info))
    return cate_info
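For orientation, a minimal usage sketch of the cache wrapper above. The entity names and cache path are hypothetical; get_cate_for_entity_list is assumed to return a dict mapping each entity to a list of category strings, as the code implies.

# Hypothetical usage; entity names and the cache path are illustrative only.
entities = set(["Red Cross", "FEMA"])

# First call: no cache file exists yet, so every entity is looked up and
# the result is written to /tmp/cate_info.json.
cate_info = get_cate_info(entities, "/tmp/cate_info.json")

# Second call: cached entities are read back from the JSON file; only the
# newly added entity triggers a lookup via get_cate_for_entity_list.
cate_info = get_cate_info(entities | set(["UNICEF"]), "/tmp/cate_info.json")
print cate_info["UNICEF"]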
Example #2
import argparse
import codecs
import json
import os
import sys

# get_candidates, get_candidate_models, get_cate_for_entity_list and
# get_feature_vector are defined elsewhere in the source module.
def main():
    # Python 2 idiom: force UTF-8 as the default string encoding.
    reload(sys)
    sys.setdefaultencoding('UTF8')
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("candiate_file")
    parser.add_argument("article_dir")
    parser.add_argument("feature_dir")
    parser.add_argument("dest_dir")
    parser.add_argument("--candiate_top", '-ct', type=int, default=20)
    parser.add_argument("--type", '-t', default="ORGANIZATION")
    parser.add_argument("--normalize", '-n', action="store_true")

    args = parser.parse_args()

    # Load the top-ranked candidate entities and build a window language
    # model for each one.
    entity_candidates = get_candidates(args.candiate_file, args.candiate_top)
    candidate_models = get_candidate_models(entity_candidates,
                                            args.article_dir, args.type)

    # Feature dictionaries; these appear to be the all_words / all_cates
    # files written by the training script in Example #3 below.
    with open(os.path.join(args.feature_dir, "all_words")) as f:
        all_words = json.load(f)
    with open(os.path.join(args.feature_dir, "all_cates")) as f:
        all_cates = json.load(f)

    cate_info = get_cate_for_entity_list(candidate_models.keys())

    # Build one feature vector per candidate entity.
    test_candidates = []
    test_vector = []
    for entity in candidate_models:
        if args.normalize:
            candidate_models[entity].normalize()
        test_candidates.append(entity)
        feature_vector = get_feature_vector(all_words, all_cates,
                                            candidate_models[entity].model,
                                            cate_info[entity])
        test_vector.append(feature_vector)

    with codecs.open(os.path.join(args.dest_dir, "test_vector"), "w",
                     'utf-8') as f:
        f.write(json.dumps(test_vector))
    with codecs.open(os.path.join(args.dest_dir, "test_candidates"), "w",
                     'utf-8') as f:
        f.write(json.dumps(test_candidates))
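The two files written above are parallel arrays: test_candidates[i] names the entity whose features are stored in test_vector[i]. A minimal sketch of reading them back, with a hypothetical dest_dir:

# Hypothetical consumer of the files written above.
import codecs
import json

with codecs.open("dest_dir/test_vector", "r", "utf-8") as f:
    test_vector = json.load(f)
with codecs.open("dest_dir/test_candidates", "r", "utf-8") as f:
    test_candidates = json.load(f)

for entity, vector in zip(test_candidates, test_vector):
    print entity, len(vector)  # one feature vector per candidate entity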
Example #3
import argparse
import codecs
import json
import os

# get_entities_judgement, get_documents, get_negative_candidates,
# get_all_sentence_windows and get_cate_for_entity_list are defined
# elsewhere in the source module.
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("disaster_name")
    parser.add_argument(
        "--top_dir",
        '-tp',
        default='/lustre/scratch/lukuang/Temporal_Summerization/TS-2013/data/disaster_profile/data'
    )
    parser.add_argument("dest_dir")
    parser.add_argument("--type", '-t', default="ORGANIZATION")
    parser.add_argument("--normalize", '-n', action="store_true")
    parser.add_argument("--small", '-s', action="store_true")
    parser.add_argument(
        "--entity_judgement_file",
        "-e",
        default="/lustre/scratch/lukuang/Temporal_Summerization/TS-2013/data/disaster_profile/data/src/entities_judgement.json"
    )
    args = parser.parse_args()

    entities_judgement = get_entities_judgement(args.entity_judgement_file,
                                                args.type, args.small)

    args.top_dir = os.path.abspath(args.top_dir)
    instance_names = entities_judgement.keys()
    documents = get_documents(instance_names, args.top_dir, args.disaster_name)

    entity_dir = os.path.join(args.top_dir, "entity", args.disaster_name)
    negative_candidates = get_negative_candidates(instance_names, entity_dir,
                                                  args.type,
                                                  entities_judgement)

    # Sentence windows around each entity mention; each window object
    # exposes a unigram model via its .model attribute.
    windows = get_all_sentence_windows(documents, entities_judgement,
                                       negative_candidates, args.type)

    # Collect the word vocabulary and the set of distinct entities.
    all_entities = []
    all_words = {}
    pure_entities = set()
    for instance in windows:
        for entity in windows[instance]:
            pure_entities.add(entity)
            for word in windows[instance][entity].model:
                if word not in all_words:
                    all_words[word] = 0

    all_words = all_words.keys()

    # Collect the distinct categories across all entities.
    cate_info = get_cate_for_entity_list(list(pure_entities))
    all_cates = []
    for entity in cate_info:
        if cate_info[entity]:
            for cate in cate_info[entity]:
                if cate not in all_cates:
                    all_cates.append(cate)

    # Full feature layout: words first, then categories.  Build a fresh
    # list rather than aliasing all_words; `+=` on an alias would also
    # append the category names to all_words and corrupt the word
    # features and the all_words output file below.
    all_features = all_words + all_cates

    judgement_vector = []
    feature_vector = []

    def build_row(instance, entity):
        """Word counts for the entity's window, then 0/1 category flags."""
        if args.normalize:
            windows[instance][entity].normalize()
        model = windows[instance][entity].model
        row = [model[w] if w in model else 0 for w in all_words]
        if cate_info[entity]:
            row += [1 if cate in cate_info[entity] else 0
                    for cate in all_cates]
        else:
            row += [0] * len(all_cates)
        return row

    # One row per (instance, entity) pair; negative candidates are
    # labelled -1 and judged entities +1.
    for instance in entities_judgement:
        for label, entities in (
                (-1, negative_candidates[instance][args.type]),
                (1, entities_judgement[instance][args.type])):
            for entity in entities:
                all_entities.append(instance + "/" + entity)
                judgement_vector.append(label)
                feature_vector.append(build_row(instance, entity))

    with codecs.open(os.path.join(args.dest_dir, "feature_vector"), "w",
                     "utf-8") as f:
        f.write(json.dumps(feature_vector))
    with codecs.open(os.path.join(args.dest_dir, "judgement_vector"), "w",
                     "utf-8") as f:
        f.write(json.dumps(judgement_vector))
    with codecs.open(os.path.join(args.dest_dir, "all_entities"), "w",
                     "utf-8") as f:
        f.write(json.dumps(all_entities))
    with codecs.open(os.path.join(args.dest_dir, "all_words"), "w",
                     "utf-8") as f:
        f.write(json.dumps(all_words))
    with codecs.open(os.path.join(args.dest_dir, "all_cates"), "w",
                     "utf-8") as f:
        f.write(json.dumps(all_cates))
    print "finished"