Example #1
args = parser.parse_args(sys.argv[1:])

# load the squad data
data = json.load(open(args.squad_data_path))

# initialize the sif embeddings stuff
(words, We) = em.data_io.getWordmap(args.embd_wordfile_path)
word2weight = em.data_io.getWordWeight(args.embd_weightfile_path, 1e-3)
weight4ind = em.data_io.getWeight(words, word2weight)
embd_params = em.params.params()
embd_params.rmpc = 0
sif_model = (words, We, word2weight, weight4ind, embd_params)

# prepare splitta
splitta_model = sbd.load_sbd_model("../splitta/model_nb/", use_svm=False)

# create the summarizer (we only need it for featurization)
featurizer = Summarizer(sif_model, None, splitta_model)

raw_text_path = tempfile.NamedTemporaryFile().name
query_path = tempfile.NamedTemporaryFile().name

with open(args.feats_file_path, "w") as fw:
    with open(args.labels_file_path, "w") as lw:

        dp_id = 0
        for d in data["data"]:
            dps = []
            # get a squad document
            sens = []
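            # Hedged sketch only: the excerpt is cut off here. A loop like this
            # typically walks the SQuAD structure (each article's "paragraphs",
            # each paragraph's "context") and splits the context into sentences
            # with the splitta model loaded above; the original script's exact
            # logic may differ.
            for p in d["paragraphs"]:
                doc = sbd.get_text_data(p["context"], tokenize=True)
                doc.featurize(splitta_model)
                splitta_model.classify(doc)
                sens.extend(doc.segment(use_preds=True, list_only=True))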
Example #2
File: mysplit.py Project: hvn002/lancet
import sbd, util, word_tokenize, os
from sbd import Model
from sbd import NB_Model
from sbd import SVM_Model
from util import Counter
from sbd import Frag
from sbd import Doc


model_path = './splitta/model_svm/'
model = sbd.load_sbd_model(model_path, use_svm=True)  # model_svm/ holds the SVM model, so pass use_svm=True

test = sbd.get_data("./splitta/sample.txt", tokenize=False)
test.featurize(model)
model.classify(test)
outfile = "123.txt"
f = open(outfile, 'w')

test.segment(use_preds=True, tokenize=False, output=f)
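As a variant sketch (not taken from this project), segment() can also return the detected sentences as an in-memory list via the list_only flag that appears in Example #5, instead of streaming them to an open file handle:

# Hedged variant: collect the sentences as a list rather than writing them to 123.txt.
sentences = test.segment(use_preds=True, tokenize=False, list_only=True)
print('%d sentences detected' % len(sentences))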
Example #3
File: text.py Project: DrDub/icsisumm
    def load_splitta_model(self, path):
        use_svm = False
        if 'svm' in path.lower(): use_svm = True
        self._splitta_model = sbd.load_sbd_model(path, use_svm)
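A minimal usage sketch, assuming a hypothetical instance named text_processor of the class this method belongs to; the only logic the example shows is that a path containing 'svm' selects the SVM model, otherwise the Naive Bayes model is loaded.

# Hedged usage sketch; text_processor is a hypothetical instance of the enclosing class.
text_processor.load_splitta_model('./splitta/model_svm/')  # 'svm' in path -> SVM model
text_processor.load_splitta_model('./splitta/model_nb/')   # otherwise -> Naive Bayes model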
Example #4
# See: http://code.google.com/p/splitta/
#
# Input is a list of XML files on the command line.
# The interview IDs are generated from the filenames.
# The output is Redis database commands on stdout. Text is UTF-8 encoded.
# Before starting, set "next_sentence_number" below!

# Make sure this is greater than the ID of any existing sentences in the database.
next_sentence_number = 1

annotator_id = 'annotators:docsouth'
dataset_id = 'datasets:docsouth'

# Splitta config
import sbd
sbd_model = sbd.load_sbd_model('model_svm/', use_svm=True)

##################################################################
##################################################################

from lxml import etree
import os.path
import sys
from traceback import print_exc

if len(sys.argv) < 2:
    print >> sys.stderr, 'Usage: {0} xml-file ...'.format(sys.argv[0])
    sys.exit(1)

print 'SADD "datasets" "{0}"'.format(dataset_id)
print 'SADD "annotators" "{0}"'.format(annotator_id)
Example #5
# This script assumes that the input is not formatted with
# one sentence per line. It skips suspected SGML markup (any
# line beginning with '<'). All other lines not separated
# by an extra line break are concatenated and then split
# with Splitta 1.03.
#
# cat sample.xml | python split_sentences.py
#
# Courtney Napoles, [email protected]
# 2012-06-29

import sys, sbd, os

model_path = os.path.dirname(sbd.__file__) + '/model_svm/'
model = sbd.load_sbd_model(model_path, True)


def split(lines):
    if len(lines) > 0:
        test = sbd.get_text_data(' '.join(lines), tokenize=True)
        test.featurize(model)
        model.classify(test)
        split_sentences = test.segment(use_preds=True, list_only=True)
        # Splitta will drop the last sentence, apparently at random.
        # We will look for dropped sentences by offset and append them
        # to the split sentence list. Recursively re-splitting missed
        # sentences does not work because Splitta still will not
        # recognize the dropped sentences
        new_length = len(' '.join(split_sentences))
        old_length = len(' '.join(lines))
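        # Hedged sketch only: the excerpt is cut off at the length comparison, and
        # this continuation just implements what the comment above describes
        # (append whatever trailing text Splitta dropped, located by character
        # offset); the original script's exact logic may differ.
        if new_length != old_length:
            missed = ' '.join(lines)[new_length:].strip()
            if missed:
                split_sentences.append(missed)
        return split_sentences
    return []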
Example #6
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--query-folder",
                        default=os.getenv("QUERY_STR"),
                        type=str,
                        required=False)
    parser.add_argument("--model-path",
                        default=os.getenv("RNNSUM_PATH"),
                        type=str,
                        required=False)
    parser.add_argument("--folder", type=str, required=True)
    parser.add_argument("--results", type=str, required=False, default="")
    parser.add_argument("--length", default=100, type=int)
    parser.add_argument("--summary-dir", required=True, type=str)
    parser.add_argument("--embd-wordfile-path", required=True, type=str)
    parser.add_argument("--embd-weightfile-path", required=True, type=str)
    parser.add_argument("--port", required=True, type=int)
    parser.add_argument("--rescore", required=True, type=str)
    parser.add_argument("--text-similarity",
                        required=False,
                        type=str,
                        default="False")
    parser.add_argument("--embd-similarity",
                        required=False,
                        type=str,
                        default="False")
    parser.add_argument("--portion", required=False, default=None, type=float)
    parser.add_argument("--stopwords",
                        required=False,
                        default="stopwords.txt",
                        type=str)
    parser.add_argument("--gen-image",
                        required=False,
                        type=str,
                        default="True")
    parser.add_argument("--workDir", required=False, type=str, default=".")
    parser.add_argument("--language", required=True, type=str, default="en")
    parser.add_argument("--segment", required=False, type=str, default="True")
    parser.add_argument("--highlight",
                        required=False,
                        type=str,
                        default="None")
    parser.add_argument("--translate-query",
                        required=False,
                        type=str,
                        default="False")
    parser.add_argument("--debug", required=False, type=str, default="False")
    args = parser.parse_args()

    args.rescore = args.rescore == "True"
    args.text_similarity = args.text_similarity == "True"
    args.embd_similarity = args.embd_similarity == "True"
    args.gen_image = args.gen_image == "True"
    args.segment = args.segment == "True"
    args.translate_query = args.translate_query == "True"
    global DEBUG
    DEBUG = args.debug == "True"

    if not os.path.exists(args.summary_dir):
        os.makedirs(args.summary_dir)

    # initialize the sif embeddings stuff
    (words, We) = em.data_io.getWordmap(args.embd_wordfile_path)
    embd_dim = len(We[0])
    word2weight = em.data_io.getWordWeight(args.embd_weightfile_path, 1e-3)
    weight4ind = em.data_io.getWeight(words, word2weight)
    embd_params = em.params.params()
    embd_params.rmpc = 0
    sif_model = (words, We, word2weight, weight4ind, embd_params)

    if args.text_similarity or args.embd_similarity:
        model = SimilarityExtractor(use_text_cosine=args.text_similarity,
                                    use_embd_cosine=args.embd_similarity,
                                    embd_dim=embd_dim)
        print("Not loading torch models, using similarity.")
    else:
        model = torch.load(args.model_path,
                           map_location=lambda storage, loc: storage)

    # stopwords
    stopwords = load_stopwords(args.stopwords)

    # prepare splitta
    splitta_model = sbd.load_sbd_model("../splitta/model_nb/", use_svm=False)

    # create the summarizer
    summarizer = Summarizer(sif_model, model, splitta_model, stopwords,
                            args.segment, args.translate_query)

    # start the server and listen to summarization requests
    serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    serversocket.bind(("", args.port))
    serversocket.listen(5)

    print(
        "Loaded all models successfully, ver:06/20/18_12:00PST, ready to accept "
        "requests on %d with rescore=%s, portion=%s, similarity text and embd: %s,%s"
        % (args.port, args.rescore, args.portion, args.text_similarity,
           args.embd_similarity))

    temp_out = tempfile.mkdtemp()
    while 1:
        (clientsocket, address) = serversocket.accept()
        data = clientsocket.recv(1000000)
        params = json.loads(str(data, "utf-8"))
        qExpansion = params["qExpansion"]
        qResults = params["qResults"] if "qResults" in params else "None"
        input_paths = get_input_paths(args.folder,
                                      args.results + "/" + qResults,
                                      args.language)

        summary_dir = args.summary_dir + "/" + qExpansion
        os.system("mkdir -p %s" % summary_dir)

        try:
            # go over all the input files and run summarization
            query_path = os.path.join(args.query_folder, qExpansion)
            for input_path, input_path2 in input_paths:
                if DEBUG:
                    print("DEBUG: working on %s and %s" %
                          (input_path, input_path2))
                try:
                    summary = summarizer.summarize_text(input_path,
                                                        input_path2,
                                                        query=query_path,
                                                        portion=args.portion,
                                                        max_length=args.length,
                                                        rescore=args.rescore)
                    output_path = os.path.join(temp_out,
                                               os.path.basename(input_path2))
                    with open(output_path, "w", encoding="utf-8") as fp:
                        fp.write(summary)
                except:
                    traceback.print_exc()
            if args.gen_image:
                summarizer.sum2img(temp_out, query_path, args.highlight)
            os.system("mv %s/* %s/ 2> /dev/null" % (temp_out, summary_dir))
            os.system("chmod -R 777 %s" % summary_dir)
        except:
            traceback.print_exc()
        clientsocket.send(SUMMARIZATION_TRIGGER.encode("utf-8"))
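For context, a hedged client-side sketch of the protocol this server expects: a JSON object with qExpansion (and optionally qResults) sent over a plain TCP socket, answered with the trigger string once the summaries have been written. The host, port, and request values below are placeholders.

# Hedged client sketch; host, port, and request values are placeholders.
import json
import socket

request = {"qExpansion": "my_query_expansion", "qResults": "my_results_dir"}
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect(("localhost", 5000))
sock.send(json.dumps(request).encode("utf-8"))
reply = sock.recv(1024)  # the server replies with SUMMARIZATION_TRIGGER when done
print("server replied:", str(reply, "utf-8"))
sock.close()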