args = parser.parse_args(sys.argv[1:])

# load the squad data
data = json.load(open(args.squad_data_path))

# initialize the sif embeddings stuff
(words, We) = em.data_io.getWordmap(args.embd_wordfile_path)
word2weight = em.data_io.getWordWeight(args.embd_weightfile_path, 1e-3)
weight4ind = em.data_io.getWeight(words, word2weight)
embd_params = em.params.params()
embd_params.rmpc = 0
sif_model = (words, We, word2weight, weight4ind, embd_params)

# prepare splitta
splitta_model = sbd.load_sbd_model("../splitta/model_nb/", use_svm=False)

# create the summarizer (we need it only for featurization)
featurizer = Summarizer(sif_model, None, splitta_model)

raw_text_path = tempfile.NamedTemporaryFile().name
query_path = tempfile.NamedTemporaryFile().name

with open(args.feats_file_path, "w") as fw:
    with open(args.labels_file_path, "w") as lw:
        dp_id = 0
        for d in data["data"]:
            dps = []
            # get a squad document
            sens = []
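
# The sif_model tuple above just bundles everything needed to embed sentences.
# A minimal sketch of how such a tuple might be consumed, assuming the `em`
# package mirrors the PrincetonML SIF layout (data_io.sentences2idx,
# data_io.seq2weight, SIF_embedding.SIF_embedding); the helper name below is
# illustrative and not part of the original snippet.
def embed_sentences(sentences, sif_model):
    words, We, word2weight, weight4ind, embd_params = sif_model
    # map sentences to word indices plus a mask, then to per-word SIF weights
    x, m = em.data_io.sentences2idx(sentences, words)
    w = em.data_io.seq2weight(x, m, weight4ind)
    # weighted average of word vectors; rmpc=0 means no principal component removal
    return em.SIF_embedding.SIF_embedding(We, x, w, embd_params)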
def load_splitta_model(self, path):
    use_svm = False
    if 'svm' in path.lower():
        use_svm = True
    self._splitta_model = sbd.load_sbd_model(path, use_svm)
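
# A standalone sketch of the same idea, for contexts without the class above:
# infer which Splitta classifier to load from the model directory name
# (the function name and example paths here are hypothetical).
import sbd

def load_splitta_model_from_path(path):
    # Splitta ships Naive Bayes and SVM models; the SVM one needs use_svm=True
    return sbd.load_sbd_model(path, use_svm='svm' in path.lower())

# nb_model = load_splitta_model_from_path('../splitta/model_nb/')
# svm_model = load_splitta_model_from_path('../splitta/model_svm/')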
# See: http://code.google.com/p/splitta/
#
# Input is a list of XML files on the command line.
# The interview IDs are generated from the filenames.
# The output is Redis database commands on stdout. Text is UTF-8 encoded.
# Before starting, set "next_sentence_number" below!
# Make sure this is greater than the ID of any existing sentences in the database.

next_sentence_number = 1

annotator_id = 'annotators:docsouth'
dataset_id = 'datasets:docsouth'

# Splitta config
import sbd
sbd_model = sbd.load_sbd_model('model_svm/', use_svm=True)

##################################################################
##################################################################

from lxml import etree
import os.path
import sys
from traceback import print_exc

if len(sys.argv) < 2:
    print >> sys.stderr, 'Usage: {0} xml-file ...'.format(sys.argv[0])
    sys.exit(1)

print 'SADD "datasets" "{0}"'.format(dataset_id)
print 'SADD "annotators" "{0}"'.format(annotator_id)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--query-folder", default=os.getenv("QUERY_STR"), type=str, required=False)
    parser.add_argument("--model-path", default=os.getenv("RNNSUM_PATH"), type=str, required=False)
    parser.add_argument("--folder", type=str, required=True)
    parser.add_argument("--results", type=str, required=False, default="")
    parser.add_argument("--length", default=100, type=int)
    parser.add_argument("--summary-dir", required=True, type=str)
    parser.add_argument("--embd-wordfile-path", required=True, type=str)
    parser.add_argument("--embd-weightfile-path", required=True, type=str)
    parser.add_argument("--port", required=True, type=int)
    parser.add_argument("--rescore", required=True, type=str)
    parser.add_argument("--text-similarity", required=False, type=str, default="False")
    parser.add_argument("--embd-similarity", required=False, type=str, default="False")
    parser.add_argument("--portion", required=False, default=None, type=float)
    parser.add_argument("--stopwords", required=False, default="stopwords.txt", type=str)
    parser.add_argument("--gen-image", required=False, type=str, default="True")
    parser.add_argument("--workDir", required=False, type=str, default=".")
    parser.add_argument("--language", required=True, type=str, default="en")
    parser.add_argument("--segment", required=False, type=str, default="True")
    parser.add_argument("--highlight", required=False, type=str, default="None")
    parser.add_argument("--translate-query", required=False, type=str, default="False")
    parser.add_argument("--debug", required=False, type=str, default="False")
    args = parser.parse_args()

    args.rescore = args.rescore == "True"
    args.text_similarity = args.text_similarity == "True"
    args.embd_similarity = args.embd_similarity == "True"
    args.gen_image = args.gen_image == "True"
    args.segment = args.segment == "True"
    args.translate_query = args.translate_query == "True"
    global DEBUG
    DEBUG = args.debug == "True"

    if not os.path.exists(args.summary_dir):
        os.makedirs(args.summary_dir)

    # initialize the sif embeddings stuff
    (words, We) = em.data_io.getWordmap(args.embd_wordfile_path)
    embd_dim = len(We[0])
    word2weight = em.data_io.getWordWeight(args.embd_weightfile_path, 1e-3)
    weight4ind = em.data_io.getWeight(words, word2weight)
    embd_params = em.params.params()
    embd_params.rmpc = 0
    sif_model = (words, We, word2weight, weight4ind, embd_params)

    if args.text_similarity or args.embd_similarity:
        model = SimilarityExtractor(use_text_cosine=args.text_similarity,
                                    use_embd_cosine=args.embd_similarity,
                                    embd_dim=embd_dim)
        print("Not loading torch models, using similarity.")
    else:
        model = torch.load(args.model_path, map_location=lambda storage, loc: storage)

    # stopwords
    stopwords = load_stopwords(args.stopwords)

    # prepare splitta
    splitta_model = sbd.load_sbd_model("../splitta/model_nb/", use_svm=False)

    # create the summarizer
    summarizer = Summarizer(sif_model, model, splitta_model, stopwords,
                            args.segment, args.translate_query)

    # start the server and listen to summarization requests
    serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    serversocket.bind(("", args.port))
    serversocket.listen(5)
    print("Loaded all models successfully, ver:06/20/18_12:00PST, ready to accept requests on %d with rescore=%s, portion=%s,similarity text and embd: %s,%s"
          % (args.port, args.rescore == True, args.portion, args.text_similarity, args.embd_similarity))

    temp_out = tempfile.mkdtemp()
    while True:
        (clientsocket, address) = serversocket.accept()
        data = clientsocket.recv(1000000)
        params = json.loads(str(data, "utf-8"))
        qExpansion = params["qExpansion"]
        qResults = params["qResults"] if "qResults" in params else "None"
        input_paths = get_input_paths(args.folder, args.results + "/" + qResults, args.language)
        summary_dir = args.summary_dir + "/" + qExpansion
        os.system("mkdir -p %s" % summary_dir)
        try:
            # go over all the input files and run summarization
            query_path = os.path.join(args.query_folder, qExpansion)
            for input_path, input_path2 in input_paths:
                if DEBUG:
                    print("DEBUG: working on %s and %s" % (input_path, input_path2))
                try:
                    summary = summarizer.summarize_text(input_path, input_path2, query=query_path,
                                                        portion=args.portion, max_length=args.length,
                                                        rescore=args.rescore)
                    output_path = os.path.join(temp_out, os.path.basename(input_path2))
                    with open(output_path, "w", encoding="utf-8") as fp:
                        fp.write(summary)
                except:
                    traceback.print_exc()
            if args.gen_image:
                summarizer.sum2img(temp_out, query_path, args.highlight)
            os.system("mv %s/* %s/ 2> /dev/null" % (temp_out, summary_dir))
            os.system("chmod -R 777 %s" % summary_dir)
        except:
            traceback.print_exc()
        clientsocket.send(SUMMARIZATION_TRIGGER.encode("utf-8"))
import sbd, util, word_tokenize, os
from sbd import Model
from sbd import NB_Model
from sbd import SVM_Model
from util import Counter
from sbd import Frag
from sbd import Doc

# model_svm/ holds Splitta's SVM model, so request the SVM loader explicitly
model_path = './splitta/model_svm/'
model = sbd.load_sbd_model(model_path, use_svm=True)

# read the sample document, featurize it, and predict sentence boundaries
test = sbd.get_data("./splitta/sample.txt", tokenize=False)
test.featurize(model)
model.classify(test)

# write the segmented sentences to a file
outfile = "123.txt"
f = open(outfile, 'w')
test.segment(use_preds=True, tokenize=False, output=f)
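
# The same pipeline also works on an in-memory string instead of a file, and can
# return the sentences as a Python list rather than writing them out; both calls
# (get_text_data and list_only=True) appear in the stdin-splitting script below.
text = "Dr. Smith arrived at 5 p.m. He left an hour later."
doc = sbd.get_text_data(text, tokenize=True)
doc.featurize(model)
model.classify(doc)
sentences = doc.segment(use_preds=True, list_only=True)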
# This script assumes that the input is not formatted with
# one sentence per line. It skips suspected SGML markup (any
# line beginning with '<'). All other lines not separated
# by an extra line break are concatenated and then split
# with Splitta 1.03.
#
# cat sample.xml | python split_sentences.py
#
# Courtney Napoles, [email protected]
# 2012-06-29

import sys, sbd, os

model_path = os.path.dirname(sbd.__file__) + '/model_svm/'
model = sbd.load_sbd_model(model_path, True)

def split(lines):
    if len(lines) > 0:
        test = sbd.get_text_data(' '.join(lines), tokenize=True)
        test.featurize(model)
        model.classify(test)
        split_sentences = test.segment(use_preds=True, list_only=True)
        # Splitta will drop the last sentence, apparently at random.
        # We will look for dropped sentences by offset and append them
        # to the split sentence list. Recursively re-splitting missed
        # sentences does not work because Splitta still will not
        # recognize the dropped sentences
        new_length = len(' '.join(split_sentences))
        old_length = len(' '.join(lines))
        if new_length != old_length:
            # the tail of the input was dropped; append it as a final sentence
            # (a sketch of the recovery step described above; the original
            # script is truncated at this point)
            dropped = ' '.join(lines)[new_length:].strip()
            if dropped:
                split_sentences.append(dropped)
        return split_sentences
    return []
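
# A sketch of the driver loop the header comment describes: read stdin, skip
# suspected SGML markup, group lines separated by blank lines, and pass each
# group to split(). The original script's actual main loop is not shown above,
# so this illustrates the stated behaviour rather than reproducing its code.
def main():
    buffered = []
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('<'):
            continue                  # skip suspected SGML markup
        if not line:
            for sentence in split(buffered):
                print(sentence)       # one sentence per output line
            buffered = []
        else:
            buffered.append(line)
    for sentence in split(buffered):  # flush the final block
        print(sentence)

if __name__ == '__main__':
    main()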