Example #1
def labeled(srf, orig_best=None, orig_docset_id=None):
    output = []
    n = 0
    curr = []
    last_docset_id = None
    for source in os.listdir(srf):
        base = ".".join(source.split(".")[:-1])
        embedding = os.path.join(srf, base + ".emd")
        if source.endswith(".nrm") and os.path.exists(embedding):
            doc_id = source.split(".")[0]
            sentences = utils.fileaslist(os.path.join(srf, source))
            embeddings = [[float(y) for y in x.split(" ")]
                          for x in utils.fileaslist(embedding)]
            if parse_rouge(os.path.join(srf, base + ".rge"), 2) < min_rge:
                continue
            best = orig_best if orig_best else set(
                utils.fileaslist(os.path.join(srf, "%s.best%d" % (base, ver))))
            docset_id = orig_docset_id if orig_docset_id else doc_id
            if docset_id != last_docset_id:
                if len(curr) > 0: output.append(curr)
                curr = []
            last_docset_id = docset_id
            for i, sen in enumerate(sentences):
                d = dict()
                n += 1
                d["docset_id"] = docset_id
                d["doc_id"] = doc_id
                d["sentence_id"] = n if orig_best else str(i + 1)
                d["embedding"] = embeddings[i]
                d["label"] = 1 if sen in best else 0
                d["text"] = sentences[i]
                if len(d) > 0: curr.append(d)
    if len(curr) > 0: output.append(curr)
    return output
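
Note: almost every snippet on this page relies on the small helpers utils.fileaslist and write2file, which are not shown here. A minimal sketch of what they presumably do, inferred only from how they are called (an assumption, not the original implementation):

def fileaslist(path):
    # read a file as a list of lines with trailing newlines removed
    with open(path, encoding="utf-8") as r:
        return [line.rstrip("\n") for line in r]

def write2file(text, path):
    # overwrite the file at path with the given text
    with open(path, "w", encoding="utf-8") as w:
        w.write(text)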
Example #2
def get_input_paths(folder, qResults, language):
    paths = []
    if os.path.isfile(qResults):
        with open(qResults) as r:
            results = json.load(r)
            for res in results["document info"]["results"]:
                index = res["index"]
                index_toks = index.replace("index_store",
                                           "mt_store").split("/")
                filename = res["filename"]
                ep = "%s/%s/%s/%s.txt" % (folder, "/".join(
                    index_toks[:5]), index_toks[-2], filename)
                if language == "en":
                    paths.append((ep, ep))
                else:
                    # check that the correct language was selected on the server
                    assert (language == "sw" and "1A/" in index
                            or language == "tl" and "1B/" in index)
                    input_name = tempfile.NamedTemporaryFile().name
                    index_toks = index.replace("index_store",
                                               "morphology_store").split("/")
                    morpho_store = "%s/%s" % (folder, "/".join(index_toks[:5]))
                    if DEBUG:
                        print("looking in morpho store: %s" % morpho_store)
                    morpho_ver = list(
                        filter(
                            lambda x: "morph-v3.0" in x.name and
                            ("v4.0" in x.name or "audio" not in ep),
                            sorted(Path(morpho_store).iterdir(),
                                   key=lambda f: f.stat().st_mtime)))[-1].name
                    #list(filter(lambda x: "morph-v3.0" in x, os.listdir(morpho_store)))[0]
                    input_file = "%s/%s/%s.txt" % (morpho_store, morpho_ver,
                                                   filename)
                    with open(input_name, "w") as w:
                        with open(input_file) as r:
                            for line in r:
                                d = json.loads(line)
                                if len(d) > 0:
                                    w.write(" ".join(
                                        map(lambda x: x["word"], d[0])) + "\n")
                                else:
                                    w.write("empty.\n")
                    paths.append((input_name, ep))
                    if (len(fileaslist(input_name)) != len(
                            list(filter(lambda x: len(x) > 0,
                                        fileaslist(ep))))):
                        if DEBUG:
                            print("DEBUG: diff sizes %s vs %s" %
                                  (input_file, ep))
    else:
        for path in os.listdir(folder):
            p = "%s/%s" % (folder, path)
            paths.append((p, p))
    return paths
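
For reference, the loop that rebuilds plain text from the morphology store expects each line of the input file to be a JSON array whose first element is a list of token objects with a "word" field. A small illustration with made-up data (the structure is inferred from the parsing code above, not taken from the actual files):

import json

# hypothetical morphology line: the first element is the token list for that segment
line = '[[{"word": "mtoto", "lemma": "mtoto"}, {"word": "anacheza", "lemma": "cheza"}]]'
d = json.loads(line)
if len(d) > 0:
    print(" ".join(map(lambda x: x["word"], d[0])))  # -> "mtoto anacheza"
else:
    print("empty.")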
Example #3
def kl(source_path):
    def dist(text):
        words = text.split(" ")
        d = defaultdict(int)
        for w in words:
            d[w] += 1.0 / len(words)
        return d

    sentences = utils.fileaslist(source_path)
    D_dist = dist(" ".join(sentences))
    best = []
    for j in range(FIRST_N_LINES):
        min_dist = 100000
        best_sen = None
        for s in sentences:
            if s in best: continue  # skip sentences already selected
            candidate = best + [s]
            S_dist = dist(" ".join(candidate))
            distance = 0.0
            for w in s.split(" "):
                distance += -S_dist[w] * math.log(
                    D_dist[w] / (S_dist[w] + 0.00000000001), 2.0)
            if distance < min_dist:
                min_dist = distance
                best_sen = s
        if best_sen:
            best.append(best_sen)
    return "\n".join(best)
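
The greedy loop above repeatedly adds the sentence whose candidate summary has the lowest KL-style divergence from the full-document unigram distribution. A tiny self-contained check of the dist() helper on toy text (the data here is made up for illustration):

from collections import defaultdict

def dist(text):
    # same unigram-frequency helper as inside kl()
    words = text.split(" ")
    d = defaultdict(int)
    for w in words:
        d[w] += 1.0 / len(words)
    return d

# "a" appears twice out of three tokens
D = dist("a b a")
assert abs(D["a"] - 2.0 / 3.0) < 1e-9
assert abs(D["b"] - 1.0 / 3.0) < 1e-9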
Example #4
    def sum2img(self, summary_dir, query_path, highlight):
        # get weights
        query_embd, _ = self.get_query_embd(query_path)
        weights_dir = tempfile.mkdtemp()
        try:
            for summary_fn in os.listdir(summary_dir):
                weights = []
                summary_path = os.path.join(summary_dir, summary_fn)
                for sen in fileaslist(summary_path):
                    sen_weights = []
                    for word in sen.split(" "):
                        word_embd = self.embed_word(word)
                        weight = (0.0 if word.lower() in self.stopwords
                                  else cossim_weight(word_embd, query_embd))
                        assert (weight >= 0.0 and weight <= 1.0)
                        sen_weights.append(weight)
                    weights.append([str(w) for w in sen_weights])
                write2file(
                    "\n".join(
                        [" ".join([str(w) for w in ws]) for ws in weights]),
                    os.path.join(weights_dir, summary_fn))

            # gen image
            os.system("./gen_images.sh %s %s %s %s" %
                      (summary_dir, weights_dir, summary_dir, highlight))
        finally:
            os.system("rm -r %s" % weights_dir)
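
cossim_weight and embed_word are not defined in this excerpt; the assert only tells us that cossim_weight maps a word/query embedding pair into [0, 1]. One plausible sketch that satisfies that contract, clipping negative cosine similarity to zero (this is an assumption, not the actual implementation):

import math

def cossim_weight(word_embd, query_embd):
    # cosine similarity clipped to [0, 1]; the real weighting scheme may differ
    dot = sum(a * b for a, b in zip(word_embd, query_embd))
    norm = (math.sqrt(sum(a * a for a in word_embd)) *
            math.sqrt(sum(b * b for b in query_embd)))
    if norm == 0.0:
        return 0.0
    return min(1.0, max(0.0, dot / norm))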
Example #5
    def get_embds(self, norm_text_path, query_path):

        # deal with the text
        out_f = tempfile.NamedTemporaryFile()
        em.print_embeddings(
            em.get_embeddings(self.em[0], self.em[1], self.em[2], self.em[3],
                              norm_text_path, self.em[4]), out_f.name)
        sen_embds = [[float(x) for x in line.split(" ")]
                     for line in fileaslist(out_f.name)]
        qry_embds, query = self.get_query_embd(query_path)
        return sen_embds, qry_embds, query
Example #6
def centroidemd(source_path):
    get_embds = lambda path: [[float(y) for y in x.split(" ")]
                              for x in utils.fileaslist(path[:-3] + "emd")]
    # load the sentence embeddings unconditionally so they are also available
    # on a cache hit (otherwise source_embds would be undefined below)
    source_embds = get_embds(source_path)
    if source_path not in cache:
        cache[source_path] = utils.average(source_embds)

    source_sens = utils.fileaslist(source_path)
    centroid = cache[source_path]

    assert len(source_sens) == len(source_embds)

    best = set()
    for j in range(FIRST_N_LINES):
        try:
            best.add(
                max(set(range(len(source_embds))) - best,
                    key=lambda i: utils.cosine_similarity(
                        source_embds[i], centroid)))
        except ValueError:
            print("too small text")

    # keep the selected sentences in document order
    return "\n".join([source_sens[i] for i in sorted(best)])
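
utils.average and utils.cosine_similarity are used here but not included on this page. Based on their usage (averaging sentence vectors into a centroid and ranking sentences by cosine similarity to it), a minimal sketch could be:

import math

def average(vectors):
    # element-wise mean of a list of equal-length vectors
    n = float(len(vectors))
    return [sum(dims) / n for dims in zip(*vectors)]

def cosine_similarity(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
    return dot / norm if norm > 0.0 else 0.0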
Example #7
    def split2sens(self, raw_text_path):
        out_file_name = tempfile.NamedTemporaryFile().name
        if self.segment:
            with open(out_file_name, "w", encoding="utf-8") as out_f:
                test = sbd.get_data(raw_text_path, tokenize=True)
                test.featurize(self.splitta_model, verbose=False)
                self.splitta_model.classify(test, verbose=False)
                test.segment(use_preds=True, tokenize=False, output=out_f)
        else:
            with open(out_file_name, "w") as w:
                for line in fileaslist(raw_text_path):
                    line = line.strip()
                    if len(line) > 0: w.write(line + "\n")
        return out_file_name
Example #8
    def get_query_embd(self, query_path):
        # extract the query from the query_path
        with open(query_path, encoding="utf-8") as qr:
            query_dict = json.load(qr)
        query = (query_dict["parsed_query"][0]["content"]
                 if not self.translate_query else get_translated_query(query_dict))
        # deal with query
        qin_f = tempfile.NamedTemporaryFile()
        write2file(wt.normalize(query), qin_f.name)
        qout_f = tempfile.NamedTemporaryFile()
        em.print_embeddings(
            em.get_embeddings(self.em[0], self.em[1], self.em[2], self.em[3],
                              qin_f.name, self.em[4]), qout_f.name)
        qry_embds = [float(x) for x in fileaslist(qout_f.name)[0].split(" ")]
        return qry_embds, query
Example #9
def runformds(datapoint_folder):
    dsf = dataset_folder
    dpf = datapoint_folder
    srf = os.path.join(dsf, dpf, "sources")
    docset_id = datapoint_folder
    max_rouge = -1.0
    best_source = None
    for source in os.listdir(srf):
        if source.endswith(".rge"):
            score = parse_rouge(os.path.join(srf, source), 1)
            if score > max_rouge:
                max_rouge = score
                best_source = ".".join(source.split(".")[:-1])
    if not best_source: return []
    best = set(
        utils.fileaslist(os.path.join(srf, "%s.best%d" % (best_source, ver))))
    return labeled(srf, orig_best=best, orig_docset_id=docset_id)
Example #10
    def ingest_text(self, raw_text_path, out_text_path, query_path):
        sens_text_path = self.split2sens(raw_text_path)
        sens_text_path2 = self.split2sens(out_text_path)
        norm_text_path = self.normalize(sens_text_path)
        sen_embds, qry_embds, query = self.get_embds(norm_text_path,
                                                     query_path)

        assert (len(fileaslist(sens_text_path)) == len(
            fileaslist(norm_text_path)))
        if DEBUG:
            print("compare sizes: %d - %d" % (len(
                fileaslist(sens_text_path2)), len(fileaslist(norm_text_path))))
        assert (len(fileaslist(sens_text_path2)) == len(
            fileaslist(norm_text_path)))

        clean_texts = fileaslist(sens_text_path2)
        sent_tokens = [sen.split(" ") for sen in fileaslist(norm_text_path)]
        return get_inputs_metadata(sent_tokens,
                                   clean_texts,
                                   sen_embds,
                                   qry_embds,
                                   query=query)
Example #11
def get_embeddings(json_input_file):
    code = abs(hash(json_input_file))
    em.create_embeddings(json_input_file, "%s/mds/%d.norm" % (TMP, code),
                         "%s/mds/%d.emb" % (TMP, code), models)
    return [[float(n) for n in l.split(" ")]
            for l in utils.fileaslist("%s/mds/%d.emb" % (TMP, code))]
Example #12
            dps = []
            # get a squad document
            sens = []
            embeddings = []
            for p in d["paragraphs"][:30]:
                write2file(p["context"], raw_text_path)
                sens_text_path = featurizer.split2sens(raw_text_path)
                norm_text_path = featurizer.normalize(sens_text_path)
                q = random.choice(p["qas"])
                query = q["question"]
                write2file(query, query_path)
                sen_embds, qry_embds, _ = featurizer.get_embds(
                    norm_text_path, query_path)
                dps.append((query, len(sens), len(sen_embds), qry_embds))
                embeddings.extend(sen_embds)
                sens.extend(fileaslist(sens_text_path))

            # write dps for the document
            inputs = []
            for i, sen in enumerate(sens):
                inpt = dict()
                inpt["sentence_id"] = i
                inpt["text"] = sen
                inpt["embedding"] = embeddings[i]
                inpt["word_count"] = len(sen.split(" "))
                inputs.append(inpt)

            for query, st, cnt, qry_embds in dps:
                dp = dict()
                dp["inputs"] = inputs
                dp["qembedding"] = qry_embds
Example #13
def first3(text_path):
    return "\n".join(utils.fileaslist(text_path)[:FIRST_N_LINES])
Example #14
def rand3(text_path):
    l = utils.fileaslist(text_path)
    return "\n".join(random.sample(l, min(FIRST_N_LINES, len(l))))
Example #15
import sys, os
import utils

sys.path.append("../rouge-scripts")
import rouge as rge

dataset = sys.argv[1]
sample = int(sys.argv[2])

for d in os.listdir(dataset)[:sample]:
    ref = utils.fileaslist(os.path.join(dataset, d, "content.txt.nrm"))
    ref = [x for x in ref if len(x.split(" ")) > 3][:3]
    best_score = 0.0
    best_f = None
    for f in os.listdir(os.path.join(dataset, d, "sources")):
        if f.endswith(".rge"):
            score = rge.parse_rouge(os.path.join(dataset, d, "sources", f), 1)
            if score > best_score:
                best_score = score
                best_f = f
    best = utils.fileaslist(
        os.path.join(dataset, d, "sources", best_f[:-3] + "best2"))
    print("==================================================")
    print("\n".join(ref))
    print("--------------------------------------------------")
    print("\n".join(best))
    print("==================================================")
Example #16
                if score > max_score:
                    max_score = score
                    best = s
        if best:
            base = ".".join(best.split(".")[:-1])
            s = base + ".best" + ver
            text_path = os.path.join(sources, base + ".nrm")
            cont_path = os.path.join(dataset, dp, "content.txt.nrm")
            if os.path.exists(os.path.join(sources,
                                           s)) and os.path.exists(text_path):
                if rge.parse_rouge(os.path.join(sources, base + ".rge"),
                                   2) < min_rge:
                    continue
                can_text = candidate(text_path)
                ref_text = "\n".join([
                    x for x in utils.fileaslist(cont_path)
                    if len(x.split(" ")) > 3
                ][:FIRST_N_LINES])
                can_path = "/tmp/mds/%s.can.txt" % base
                ref_path = "/tmp/mds/%s.ref.txt" % base
                utils.write2file(can_text, can_path)
                utils.write2file(ref_text, ref_path)
                eval_writer.write("%s %s\n" % (can_path, ref_path))

    eval_writer.close()
    print("created the evaluation file, running rouge...")

    os.chdir(rouge_dir)
    rge.rouge(1000, eval_path, eval_out)

    print("done.")
Example #17
import os, sys, utils

input_dir = sys.argv[1]

for f in os.listdir(input_dir):
    text = utils.fileaslist("%s/%s" % (input_dir, f))
    clean = []
    for sen in text:
        if len(sen.split(" ")) >= 4:
            clean.append(sen)
    if len(text) > len(clean): print("%d => %d" % (len(text), len(clean)))
    utils.write2file("\n".join(clean), "%s/%s" % (input_dir, f))
Example #18
outputs_dir = sys.argv[5]

os.system("mkdir -p %s" % (outputs_dir))

n = 1000
print("number of shards: %d, start doc %d, end doc %d" % (n, start, end))

data_path = "%s/data%d-%d.txt" % (outputs_dir, start, end)

os.system("> %s" % data_path)
doc = start
d = start // n
if doc != 0: d += 1
with open(data_path, "w") as w:
    while doc <= end:
        if doc % n == 0: d += 1
        os.system("cp %s/%d/%d.query /tmp/yan/query/queries.txt" %
                  (inputs_path, d, doc))
        os.system("cp %s/%d/%d.txt /tmp/yan/inputs/input.txt" %
                  (inputs_path, d, doc))
        run(port)
        summary_sens = fileaslist("/tmp/yan/outputs/input.txt")
        newdoc = " ".join(summary_sens)
        if "\n" in newdoc: raise Exception("new line in the summary!")
        if len(summary_sens) > 0:
            w.write(newdoc + "\n")
        else:
            w.write("\n")
        print("done with document %d" % doc)
        doc += 1
Example #19
    def normalize(self, sens_path):
        out_file_name = tempfile.NamedTemporaryFile().name
        write2file(
            "\n".join([wt.normalize(line) for line in fileaslist(sens_path)]),
            out_file_name)
        return out_file_name
Example #20
import sys, os
sys.path.append("../training")
from utils import fileaslist

d1 = sys.argv[1]
d2 = sys.argv[2]

s = 0.0
n = 0
for f1 in os.listdir(d1):
  pred1 = set(fileaslist("%s/%s" % (d1,f1)))
  pred2 = set(fileaslist("%s/%s" % (d2,f1)))
  score = float(len(pred1.intersection(pred2))) / len(pred1.union(pred2))
  s += score
  n += 1

print("avg similarity: %f" % (s / n))
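
The per-file score above is the Jaccard similarity of the two prediction sets. A tiny self-contained check of that formula (the sentences are hypothetical):

pred1 = {"sentence a", "sentence b", "sentence c"}
pred2 = {"sentence b", "sentence c", "sentence d"}
# |intersection| / |union| = 2 / 4 = 0.5
score = float(len(pred1 & pred2)) / len(pred1 | pred2)
assert score == 0.5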