Example #1
def evaluate_one_model(pickle_path,
                       pk_sys_set,
                       system_path,
                       system_id,
                       modeltype='mymodel',
                       topksent=10,
                       on_mance=False,
                       on_sysce=False):
    print "model type =", modeltype, "top k =", topksent
    for i, (file_name, system_name) in enumerate(pk_sys_set):
        print i, file_name
        pickle_file = os.path.join(pickle_path, file_name)
        if modeltype == 'mymodel':  # MyModel.py
            model = MyModel(pickle_file)
        elif modeltype == 'tfidf':  # tf_idf.py
            model = TfIdf(pickle_file)
        elif modeltype == 'graphb':  # graph_base.py
            model = GraphBased(pickle_file)
        elif modeltype == 'graphw':  # word_graph.py
            model = WordGraph(pickle_file)
        elif modeltype == 'context1':  # sxpContextModel.py  SubPara Model
            model = conTextModel(pickle_file)
        elif modeltype == 'mysecmodel':  # MySecModel.py
            model = MySecModel(pickle_file)
        elif modeltype == 'myseccontextmodel':  # MySecContextModel.py SecSub Model
            model = SecConTextModel(pickle_file)
        elif modeltype == 'hybrid':  # sxpHybridGraph.py
            model = HybridGraph(pickle_file)
        elif modeltype == 'sectitle':  # MySecTitleNetwork.py
            model = MySecTitleModel(pickle_file)
        else:  # fail fast instead of hitting a NameError on the unbound `model` below
            raise ValueError('unknown modeltype: %s' % modeltype)
        # -- save top k sentences to text --
        topksent_path = os.path.join(system_path,
                                     system_name + "." + system_id + ".txt")
        tops = model.OutPutTopKSent(topksent, 1, -1)
        st = ProduceSystem(tops, file_name, 1)
        WriteStrFile(topksent_path, st, 'utf-8')
        if on_mance:
            topk_mance_sent_path = os.path.join(
                system_path, system_name + "." + system_id + ".mance.txt")
            topk_mance_sent = OutPutTopKmanCESent(model, topksent, 1, -1)
            mance_st = ProduceSystem(topk_mance_sent, file_name, 1)
            WriteStrFile(topk_mance_sent_path, mance_st, 'utf-8')
        if on_sysce:
            topk_sysce_sent_path = os.path.join(
                system_path, system_name + "." + system_id + ".sysce.txt")
            topk_sysce_sent = OutPutTopKsysCESent(model, topksent, 1, -1)
            sysce_st = ProduceSystem(topk_sysce_sent, file_name, 1)
            WriteStrFile(topk_sysce_sent_path, sysce_st, 'utf-8')
        # -- save all ranked sentences to text --
        allsent = model.OutputAllRankSentence()
        pkfname = topksent_path + 'allsent.pk'
        StoreSxptext(allsent, pkfname)
    print "ranking complete!"
Example #2
def evaluate_all_duc(fp,
                     papers_path,
                     pk_path,
                     modeltype='mymodel',
                     inc_abscon=True,
                     useabstr=1,
                     maxtxt=-1,
                     topk=10):
    print "model type =", modeltype
    # -- if modeltype is not in our model list, skip the whole process --
    if modeltype not in idname:
        return
    # -- get algorithms generated summaries directory --
    if not inc_abscon:
        system_dir = os.path.join(fp, 'systemFliter_html1')
    else:
        system_dir = os.path.join(fp, 'systemFliter_html2')
    if not os.path.exists(system_dir):
        os.makedirs(system_dir)

    # -- get the original filename list in the path variable --
    file_list = sxpGetDirFileList(papers_path)[0]

    # -- for each file, get its sxpText and run modeltype ranking model on it --
    for i, file_name in enumerate(file_list):
        # ---- get file name ----
        fset = file_name.split('.')
        if fset[-1] != 'xhtml':  # if current file is not an .xhtml file, it is not one of the original papers
            continue
        print "processing", i, "th paper named as", file_name
        # -- get single pickle file directory --
        if not inc_abscon:
            pickle_path = os.path.join(pk_path, file_name +'_1.pickle')
        else:
            pickle_path = os.path.join(pk_path, file_name +'_2.pickle')
        # -- run ranking model on sxpText object at pickle_path --
        if modeltype == 'mymodel':
            model = MyModel(pickle_path)
        elif modeltype == 'tfidf':
            model = TfIdf(pickle_path)
        elif modeltype == 'graphb':
            model = GraphBased(pickle_path)
        elif modeltype == 'graphw':
            model = WordGraph(pickle_path)
        elif modeltype == 'context1':
            model = conTextModel(pickle_path)
        elif modeltype == 'mysecmodel':
            model = MySecModel(pickle_path)
        elif modeltype == 'myseccontextmodel':
            model = SecConTextModel(pickle_path)
        elif modeltype == 'hybrid':  # sxpHybridGraph.py
            model = HybridGraph(pickle_path)
        elif modeltype == 'sectitle':  # MySecTitleNetwork.py
            model = MySecTitleModel(pickle_path)

        # -- get the numeric suffix used in this model's output file names --
        idstr = idname[modeltype]
        # -- save original model ranked topk sentences to text --
        topksent_path = os.path.join(system_dir, file_name+'.html.'+idstr)
        tops = model.OutPutTopKSent(topk, useabstr, maxtxt)
        st = ProduceSystem(tops, file_name, 1)
        WriteStrFile(topksent_path, st, 'utf-8')
        # -- save the topk sentences ranked by pattern-matched cause-effect links to text --
        sysce_topk = min(topk, len(model.text.sysce_sent_id_dict))
        if sysce_topk > 0:
            topk_sysce_sent_path = os.path.join(system_dir, file_name+'.html.'+str(30 + int(idstr)))
            topk_sysce_sent = OutPutTopKsysCESent(model, sysce_topk, useabstr, maxtxt)
            sysce_st = ProduceSystem(topk_sysce_sent, file_name, 1)
            WriteStrFile(topk_sysce_sent_path, sysce_st, 'utf-8')
    print "ranking complete"
Example #3
def evaluate_all_kg_sec(system_path,
                        pk_path,
                        modeltype='mymodel',
                        ce_opt='mance',
                        bias=0.65,
                        fname_topk_dict={},
                        fname_maxstr_dict={},
                        useabstr=0,
                        maxstr=2500,
                        topk=25):
    print "model type =", modeltype
    # -- get the original filename list in the path variable --
    file_list = sxpGetDirFileList(pk_path)[0]

    # -- for each file, get its sxpText and run modeltype ranking model on it --
    for i, file_name in enumerate(file_list):
        # ---- get file name ----
        fset = file_name.split('.')
        fname = '.'.join(fset[0:2])
        secname = fset[2]
        if fset[-1] != 'pk':  # if current file is not a pickle file, it is not one of the original papers
            continue

        # ---- Get system summaries directory ----
        system_dir = os.path.join(system_path, fname, secname)
        if not os.path.exists(system_dir):
            os.makedirs(system_dir)

        # ---- get the per-section caps on summary length (characters) and top-k sentences ----
        if fname + '.' + secname in fname_maxstr_dict:
            maxstr = fname_maxstr_dict[fname + '.' + secname]
        if fname + '.' + secname in fname_topk_dict:
            topk = fname_topk_dict[fname + '.' + secname]

        print "processing", i, "th paper named as", file_name
        # -- get single pickle file directory --
        pickle_path = os.path.join(pk_path, file_name)
        # ----------- Original Sentence Ranking Models --------------
        if modeltype == 'mymodel':
            model = MyModel(pickle_path)
        elif modeltype == 'tfidf':
            model = TfIdf(pickle_path)
        elif modeltype == 'graphb':
            model = GraphBased(pickle_path)
        elif modeltype == 'graphw':
            model = WordGraph(pickle_path)
        elif modeltype == 'context1':
            model = conTextModel(pickle_path)
        elif modeltype == 'mysecmodel':
            model = MySecModel(pickle_path)
        elif modeltype == 'myseccontextmodel':
            model = SecConTextModel(pickle_path)
        elif modeltype == 'hybrid':  # sxpHybridGraph.py
            model = HybridGraph(pickle_path)
        elif modeltype == 'sectitle':  # MySecTitleNetwork.py
            model = MySecTitleModel(pickle_path)
        # --------------- CEBias Ranking Models -------------------
        elif modeltype == 'mymodel2':
            model = MyModel2(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'tfidf2':
            model = TfIdf2(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'graphb2':
            model = GraphBased2(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'graphw2':
            model = WordGraph2(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'context2':
            model = conTextModel2(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'mysecmodel2':
            model = MySecModel2(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'myseccontextmodel2':
            model = SecConTextModel2(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'hybrid2':  # sxpHybridGraph.py
            model = HybridGraph2(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'sectitle2':  # MySecTitleNetwork.py
            model = MySecTitleModel2(pickle_path, opt=ce_opt, bias=bias)
        # --------------- CEIter Ranking Models -------------------
        elif modeltype == 'mymodel3':
            model = MyModel3(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'context3':
            model = conTextModel3(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'mysecmodel3':
            model = MySecModel3(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'myseccontextmodel3':
            model = SecConTextModel3(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'hybrid3':  # sxpHybridGraph.py
            model = HybridGraph3(pickle_path, opt=ce_opt, bias=bias)
        elif modeltype == 'sectitle3':  # MySecTitleNetwork.py
            model = MySecTitleModel3(pickle_path, opt=ce_opt, bias=bias)

        # -- get the numeric suffix used in this model's output file names --
        if modeltype in idname:
            idstr = idname[modeltype]

        # -- save original model ranked topk sentences to text --
        topksent_path = os.path.join(system_dir,
                                     fname + '.' + secname + '.html.' + idstr)
        topksent_path_2 = os.path.join(
            system_dir, fname + '.' + secname + '.html.sent.' + idstr)
        tops = model.OutPutTopKSent(topk, useabstr, maxstr)
        st = ProduceSystem(tops, fname + '.' + secname, 1)
        if useabstr < 0:
            WriteStrFile(topksent_path_2, st, 'utf-8')
        else:
            WriteStrFile(topksent_path, st, 'utf-8')

    print "ranking complete"
Example #4
def evaluate_all_duc(fp,
                     papers_path,
                     pk_path,
                     modeltype='mymodel',
                     inc_abscon=True,
                     useabstr=1,
                     maxtxt=-1,
                     topk=10):
    print "model type =", modeltype
    # -- if modeltype is not in our model list, skip the whole process --
    if modeltype not in idname:
        return
    # -- get algorithms generated summaries directory --
    if not inc_abscon:
        system_dir = os.path.join(fp, 'systemPure_html1')
        rankedsentdir = os.path.join(fp, 'RankedSentPure_1')
    else:
        system_dir = os.path.join(fp, 'systemPure_html2')
        rankedsentdir = os.path.join(fp, 'RankedSentPure_2')
    if not os.path.exists(system_dir):
        os.makedirs(system_dir)
    if not os.path.exists(rankedsentdir):
        os.makedirs(rankedsentdir)

    # -- get the original filename list in the path variable --
    file_list = sxpGetDirFileList(papers_path)[0]

    # -- for each file, get its sxpText and run modeltype ranking model on it --
    for i, file_name in enumerate(file_list):
        # ---- skip files that are not one of the original papers ----
        if not re.match(ur'AP\d{6}-\d{4}|FBIS\d-\d+', file_name):
            continue
        print "processing", i, "th paper named as", file_name
        # -- get single pickle file path (identical regardless of inc_abscon) --
        pickle_path = os.path.join(pk_path, file_name)
        # -- run ranking model on sxpText object at pickle_path --
        if modeltype == 'mymodel':
            model = MyModel(pickle_path)
        elif modeltype == 'tfidf':
            model = TfIdf(pickle_path)
        elif modeltype == 'graphb':
            model = GraphBased(pickle_path)
        elif modeltype == 'graphw':
            model = WordGraph(pickle_path)
        elif modeltype == 'context1':
            model = conTextModel(pickle_path)
        elif modeltype == 'mysecmodel':
            model = MySecModel(pickle_path)
        elif modeltype == 'myseccontextmodel':
            model = SecConTextModel(pickle_path)
        elif modeltype == 'hybrid':  # sxpHybridGraph.py
            model = HybridGraph(pickle_path)
        elif modeltype == 'sectitle':  # MySecTitleNetwork.py
            model = MySecTitleModel(pickle_path)

        # -- get the numeric suffix used in this model's output file names --
        idstr = idname[modeltype]
        # -- save the ranked sentences to a pickle file --
        ranked_sent_fp = os.path.join(
            rankedsentdir, file_name + ".html." + idstr + ".allsent.pk")
        sentidlst = sorted(model.text.ce_sys.iteritems(),
                           key=lambda asd: asd[1],
                           reverse=True)
        ranked_sentences = [
            model.text.sentenceset[sentid[0]].sentence_text
            for sentid in sentidlst
        ]
        StoreSxptext(ranked_sentences, ranked_sent_fp)
        # -- save original model ranked topk sentences to text --
        topksent_path = os.path.join(system_dir, file_name + '.html.' + idstr)
        tops = model.OutPutTopKSent(topk, useabstr, maxtxt)
        st = ProduceSystem(tops, file_name, 1)
        WriteStrFile(topksent_path, st, 'utf-8')
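A call sketch for this DUC variant, sweeping a few model types under an assumed layout; idname (defined elsewhere in the module) maps each modeltype to the numeric suffix of its output files. The paths and the choice of model types are hypothetical.

# Hypothetical sweep over several ranking models.
for mt in ('mymodel', 'tfidf', 'graphb'):
    evaluate_all_duc('./duc_eval', './duc_eval/docs', './duc_eval/pickles',
                     modeltype=mt, inc_abscon=False, topk=10)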
Example #5
def evaluate_all_kg(fp,
                    papers_path,
                    pk_path,
                    modeltype='mymodel',
                    inc_abscon=True,
                    useabstr=1,
                    maxtxt=-1,
                    fname_topk_dict={}):
    print "model type =", modeltype
    # systemdir should be managed according to the inc_abscon parameter
    if not inc_abscon:
        system_dir = os.path.join(fp, 'systemPure_1')
        system_mandir = os.path.join(fp, 'systemPure_man1')
    else:
        system_dir = os.path.join(fp, 'systemPure_2')
        system_mandir = os.path.join(fp, 'systemPure_man2')
    if not os.path.exists(system_dir):
        os.makedirs(system_dir)
    if not os.path.exists(system_mandir):
        os.makedirs(system_mandir)

    # -- get the original filename list in the path variable --
    file_list = sxpGetDirFileList(papers_path)[0]

    # -- for each file, get its sxpText and run modeltype ranking model on it --
    for i, file_name in enumerate(file_list):
        # ---- get the topk sentence budget for this file; skip files without one ----
        if file_name not in fname_topk_dict:
            continue
        topk = fname_topk_dict[file_name]
        # ---- get file name ----
        fset = file_name.split('.')
        if fset[-1] != 'txt':  # if current file is not a txt file, it is not one of the original papers
            continue
        print "processing", i, "th paper named as", file_name
        # -- get single pickle file directory --
        if not inc_abscon:
            pickle_path = os.path.join(pk_path, file_name + '_1.pk')
        else:
            pickle_path = os.path.join(pk_path, file_name + '_2.pk')
        # -- run ranking model on sxpText object at pickle_path --
        if modeltype == 'mymodel':
            model = MyModel(pickle_path)
        elif modeltype == 'tfidf':
            model = TfIdf(pickle_path)
        elif modeltype == 'graphb':
            model = GraphBased(pickle_path)
        elif modeltype == 'graphw':
            model = WordGraph(pickle_path)
        elif modeltype == 'context1':
            model = conTextModel(pickle_path)
        elif modeltype == 'mysecmodel':
            model = MySecModel(pickle_path)
        elif modeltype == 'myseccontextmodel':
            model = SecConTextModel(pickle_path)
        elif modeltype == 'hybrid':  # sxpHybridGraph.py
            model = HybridGraph(pickle_path)
        elif modeltype == 'sectitle':  # MySecTitleNetwork.py
            model = MySecTitleModel(pickle_path)

        # -- get the numeric suffix used in this model's output file names --
        if modeltype in idname:
            idstr = idname[modeltype]
        # -- save original model ranked topk sentences to text --
        tops = model.OutPutTopKSent(topk, useabstr, maxtxt)
        st = ProduceSystem(tops, file_name, 1)
        topksent_path = os.path.join(system_dir, file_name + '.html.' + idstr)
        WriteStrFile(topksent_path, st, 'utf-8')
        topksent_path = os.path.join(system_mandir,
                                     file_name + '.html.' + idstr)
        WriteStrFile(topksent_path, st, 'utf-8')
    print "ranking complete"