Example #1
def prepare_datasets_for_emma(target, source, env):
    args = source[-1].read()


    try:
        with meta_open(source[0].rstr()) as ifdA:
            dataA = DataSet.from_stream(ifdA)[0]
            analysesA = {w : ", ".join([" ".join(["%s:NULL" % m for m in a]) for a in aa]) for w, aa in dataA.get_analyses().iteritems()}
    except:
        with meta_open(source[0].rstr()) as ifdA:
            analysesA = {w : r for w, r in [l.strip().split("\t") for l in ifdA]}


    try:
        with meta_open(source[1].rstr()) as ifdB:
            dataB = DataSet.from_stream(ifdB)[0]
            analysesB = {w : ", ".join([" ".join(["%s:NULL" % m for m in a]) for a in aa]) for w, aa in dataB.get_analyses().iteritems()}
    except:
        with meta_open(source[1].rstr()) as ifdB:
            analysesB = {w : r for w, r in [l.strip().split("\t") for l in ifdB]}            

            
    wordsA = set(analysesA.keys())
    wordsB = set(analysesB.keys()) 
    common_words = wordsA.intersection(wordsB)
    with meta_open(target[0].rstr(), "w") as ofdA, meta_open(target[1].rstr(), "w") as ofdB:
        for word in common_words:
            ofdA.write(("%s\t%s\n" % (word, analysesA[word])).encode("utf8"))
            ofdB.write(("%s\t%s\n" % (word, analysesB[word])).encode("utf8"))
    return None
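All of these snippets are SCons builder actions: target and source are lists of nodes, file nodes expose rstr() for their path, and Value nodes expose read() for an attached Python object. As a minimal sketch of how such an action might be attached in an SConstruct (the builder name, file names, and Value payload below are hypothetical, not taken from the original build files):

# Hypothetical SConstruct fragment; prepare_datasets_for_emma is assumed to be
# defined in, or imported into, the SConstruct.
env = Environment()
env["BUILDERS"]["PrepareDatasetsForEMMA"] = Builder(action=prepare_datasets_for_emma)
env.PrepareDatasetsForEMMA(
    ["work/emma_a.txt", "work/emma_b.txt"],           # target[0], target[1]
    ["data/a.xml.gz", "data/b.xml.gz", Value(None)],  # source[0], source[1]; source[-1].read() -> None
)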
Example #2
def evaluate_morphology(target, source, env):
    with meta_open(source[0].rstr()) as gold_fd, meta_open(source[1].rstr()) as pred_fd:
        gold = DataSet.from_stream(gold_fd)
        pred = DataSet.from_stream(pred_fd)
        gold_analyses = gold.get_analyses()
        pred_analyses = pred.get_analyses()
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("%f\n" % 1.0)  # placeholder: the gold/pred analyses loaded above are not yet compared
    return None
Example #3
def random_tags(target, source, env):
    num_tags = env["NUM_TAGS"]
    style_name = source[1].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    if style_name == "type-based":
        wordIndexToTag = {i : randint(0, num_tags) for i in data.indexToWord.keys()}
        new_data = DataSet.from_sentences([[(data.indexToWord[w], str(wordIndexToTag[w]), [data.indexToAnalysis[a] for a in aa]) for w, t, aa in s] for s in data.sentences])
    else:
        new_data = DataSet.from_sentences([[(data.indexToWord[w], str(randint(0, num_tags)), [data.indexToAnalysis[a] for a in aa]) for w, t, aa in s] for s in data.sentences])
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None
Example #4
def conllish_to_xml(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        sentences = [[(w, t, []) for w, t in [re.split(r"\s+", x) for x in s.split("\n") if not re.match(r"^\s*$", x)]] for s in re.split(r"\n\n", ifd.read(), flags=re.M)]
    data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        data.write(ofd)
    return None
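For reference, conllish_to_xml expects CoNLL-style input: one whitespace-separated word/tag pair per line, with blank lines separating sentences. An illustrative fragment (invented tokens, only to show the shape the regular expressions above assume):

the	DT
dog	NN
barks	VBZ

it	PRP
sleeps	VBZ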
Example #5
def add_morphology(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    morphology = {}
    with meta_open(source[1].rstr()) as ifd:        
        for l in ifd:
            word, analyses = l.split("\t")
            morphology[word] = set()
            for analysis in analyses.split(", "):
                morphology[word].add(tuple([morph.split(":")[0] for morph in analysis.split() if not morph.startswith("~")]))

    #print [[(data.indexToWord[w], data.indexToTag.get(t, None), morphology.get(data.indexToWord[w], [])) for w, t, aa in s] for s in data.sentences][0:10]
    new_data = DataSet.from_sentences([[(data.indexToWord[w], data.indexToTag.get(t, None), get_without_case(data.indexToWord[w], morphology)) for w, t, aa in s] for s in data.sentences])
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None
Example #6
def create_subset(target, source, env):
    amount = source[1].read()
    method = source[2].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)
    subset = data.get_subset(range(amount))
    with meta_open(target[0].rstr(), "w") as ofd:
        subset.write(ofd)
    return None
Example #7
def rtm_to_data(target, source, env):
    sentences = []
    with meta_open(source[0].rstr()) as ifd:
        for sentence in ifd:
            words = [w for w in sentence.split()[5:] if w not in ["(())", "IGNORE_TIME_SEGMENT_IN_SCORING"]]
            if len(words) > 0:
                sentences.append(words)
    dataset = DataSet.from_sentences([[(w, None, []) for w in s] for s in sentences])
    with meta_open(target[0].rstr(), "w") as ofd:
        dataset.write(ofd)
    return None
Example #8
def morfessor_to_tripartite(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)
    for k in data.indexToAnalysis.keys():
        old = [x[1] for x in data.indexToAnalysis[k]]
        # sort morph chunks by length (ascending); the longest chunk becomes the stem
        sizes = sorted(enumerate([len(x) for x in old]), key=lambda x: x[1])
        stem_index = sizes[-1][0]
        stem = ("stem", old[stem_index])
        prefix = ("prefix", "".join(old[:stem_index]))
        suffix = ("suffix", "".join(old[stem_index + 1:]))        
        data.indexToAnalysis[k] = [({}, x[1]) for x in [prefix, stem, suffix] if x[1] != ""]
    with meta_open(target[0].rstr(), "w") as ofd:
        data.write(ofd)
    return None
Example #9
def dataset_to_emma(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    with meta_open(target[0].rstr(), "w") as ofd:
        for word, analyses in sorted(data.get_analyses().iteritems()):
            #if not re.match(r".*\W.*", word):
            #    continue
            word = word.lower()
            if len(analyses) == 0:
                x = "%s\t%s:NULL\n" % (word, word) #", ".join([" ".join(["%s:NULL" % m for m in a]) for a in analyses]))
            else:
                x = "%s\t%s\n" % (word, ", ".join([" ".join(["%s:NULL" % m.lower() for m in a]) for a in analyses]))            
            ofd.write(x.encode("utf-8"))
    return None
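For reference, each line written by dataset_to_emma pairs a lower-cased word with its analyses: morphs are suffixed with ":NULL", morphs within one analysis are space-separated, and alternative analyses are separated by ", " (a word with no analyses falls back to the whole word as a single morph). An illustrative line for a word with two analyses (invented morphs, purely to show the format):

unhappiness	un:NULL happiness:NULL, un:NULL happi:NULL ness:NULL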
Example #10
def random_segmentations(target, source, env):
    def get_random_segmentation(w):
        stem_length = randint(1, len(w))
        prefix_length = randint(0, len(w) - stem_length)
        suffix_length = randint(0, len(w) - (stem_length + prefix_length))
        prefix = w[:prefix_length]
        stem = w[prefix_length:prefix_length + stem_length]
        suffix = w[prefix_length + stem_length:]
        return tuple([x for x in [prefix, stem, suffix] if len(x) > 0])
    style_name = source[1].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    if style_name == "type-based":
        wordIndexToAnalysis = {}
        for i, w in data.indexToWord.iteritems():
            wordIndexToAnalysis[i] = get_random_segmentation(w)
        sentences = [[(data.indexToWord[w], data.indexToTag.get(t, None), [wordIndexToAnalysis[w]]) for w, t, aa in s] for s in data.sentences]
    else:        
        sentences = [[(data.indexToWord[w], data.indexToTag.get(t, None), [get_random_segmentation(data.indexToWord[w])]) for w, t, aa in s] for s in data.sentences]
    new_data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None
Example #11
def evaluate_tagging_vm(target, source, env):
    with meta_open(source[0].rstr()) as gold_fd, meta_open(source[1].rstr()) as pred_fd:
        gold = DataSet.from_stream(gold_fd)[-1]
        preds = DataSet.from_stream(pred_fd)
        #assert(len(gold.sentences) == len(pred.sentences))
    scores = []
    for pred in preds:
        for gold_sentence, pred_sentence in zip(gold.sentences, pred.sentences):
            assert(len(gold_sentence) == len(pred_sentence))
        gold_tags = sum([[l[1] for l in s] for s in gold.sentences], [])
        pred_tags = sum([[l[1] for l in s] for s in pred.sentences], [])
        #scores = {}
    #scores["TRand"] = adjusted_rand(gold_tags, pred_tags)
    #scores["TmPur"] = harmonic_mean(modified_purity(gold_tags, pred_tags), modified_purity(pred_tags, gold_tags, 2))
    #scores["TNIS"] = 1.0 - normalized_information_distance(gold_tags, pred_tags)
        scores.append({"VM" : v_measure(pred_tags, gold_tags)})
        #names = scores.keys()
    names = ["VM"]
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("\t".join(names) + "\n")
        for score in scores:
            ofd.write("\t".join(["%.3f" % score[x] for x in names]) + "\n")
    return None
Example #12
def top_words_by_tag(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    counts = numpy.zeros(shape=(len(data.indexToWord), len(data.indexToTag)))
    for sentence in data.sentences:
        for w, t, aa in sentence:
            counts[w, t] += 1
    tag_totals = counts.sum(0)
    word_totals = counts.sum(1)
    keep = 10
    with meta_open(target[0].rstr(), "w") as ofd:
        for tag_id, tag_total in enumerate(tag_totals):
            word_counts = counts[:, tag_id] #.argsort()
            indices = [(i, word_counts[i]) for i in reversed(word_counts.argsort())][0:keep]
            ofd.write(" ".join(["%s-%.2f-%.2f" % (data.indexToWord[i], float(c) / tag_total, float(c) / word_totals[i]) for i, c in indices]) + "\n")
    return None
Example #13
def actual_filter(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        if "xml" in source[0].rstr():
            words = sum([x.indexToWord.values() for x in DataSet.from_stream(ifd)], [])
        else:
            words = sum([[x for x in re.split(r"\s+", l.strip()) if not re.match(r".*\d.*", x)] for l in ifd], [])
    good, bad = split_words(target, source, env, words)
    try:
        with open(target[0].rstr(), "w") as good_ofd, open(target[1].rstr(), "w") as bad_ofd:
            good_ofd.write("\n".join(sorted(good)).encode("utf-8"))
            bad_ofd.write("\n".join(sorted(bad)).encode("utf-8"))
    except:
        with meta_open(target[0].rstr(), "w") as good_ofd, meta_open(target[1].rstr(), "w") as bad_ofd:
            good_ofd.write("\n".join(sorted(good)))
            bad_ofd.write("\n".join(sorted(bad)))
    return None
Example #14
def oov_reduction(target, source, env):
    """
    split expansions into buckets
    for each bucket 0 to N, output line of format:
      OOV_TOTAL_TOKENS, OOV_TOTAL_TYPES, BUCKET_TOTAL_TOKENS, BUCKET_TOTAL_TYPES, BUCKET_TOTAL_OOV_TOKENS, BUCKET_TOTAL_OOV_TYPES, BUCKET_ACCEPTED_TYPES
    """
    if len(source) == 4:
        bucket_size = 1000
    else:
        bucket_size = source[4].read()
    training_fname, expansion_fname, oov_fname, accepted_fname = [x.rstr() for x in source[0:4]]
    with meta_open(training_fname) as training_ifd, meta_open(expansion_fname) as expansion_ifd, meta_open(oov_fname) as oov_ifd, meta_open(accepted_fname) as accepted_ifd:
        training = set(DataSet.from_stream(training_ifd)[-1].indexToWord.values())
        expansion = [(w, math.exp(-float(lp))) for w, lp in [l.split() for l in expansion_ifd] if w not in training]
        oov = {w : int(c) for c, w in [l.strip().split() for l in oov_ifd] if w not in training}
        accepted = set([x.strip() for x in accepted_ifd])
    total_prob = sum([x[1] for x in expansion])
    expansion = [(w, p / total_prob) for w, p in expansion]
    values = [[0, 0, 0, 0, 0]]
    buckets = len(expansion) / bucket_size
    #oov = {w : c for w, c in oov.iteritems() if w not in training}
    total_oov_tokens = sum(oov.values())
    total_oov_types = len(oov)    
    for bucket in range(buckets):
        bucket_total_tokens, bucket_total_types, bucket_total_oov_tokens, bucket_total_oov_types, accepted_types = values[bucket]
        for w, p in expansion[bucket * bucket_size : (bucket + 1) * bucket_size]:
            predicted_oov_count = total_oov_tokens * p
            bucket_total_tokens += predicted_oov_count
            bucket_total_types += 1
            if w in accepted:
                accepted_types += 1
            if w in oov:
                bucket_total_oov_tokens += oov[w]
                bucket_total_oov_types += 1
        values.append([bucket_total_tokens, bucket_total_types, bucket_total_oov_tokens, bucket_total_oov_types, accepted_types])
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("\t".join(["Total OOV Tokens", "Total OOV Types", "Bucket Total Tokens", "Bucket Total Types", "Bucket Total OOV Tokens", "Bucket Total OOV Types", "Bucket Accepted Types"]) + "\n")
        ofd.write("\n".join(["\t".join([str(int(y)) for y in [total_oov_tokens, total_oov_types] + x]) for x in values[1:]]) + "\n")
    return None
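oov_reduction thus writes a tab-separated report with the header above and one row of integers per bucket. A minimal sketch (not part of the original code; the helper name and path argument are hypothetical) for reading that report back into one dictionary per bucket:

def read_oov_report(path):
    # one dict per bucket, keyed by the column names in the header row
    with open(path) as fd:
        header = fd.readline().rstrip("\n").split("\t")
        return [dict(zip(header, [int(v) for v in line.split("\t")])) for line in fd if line.strip()]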
Example #15
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", dest="input")
parser.add_argument("-o", "--output", dest="output")
options = parser.parse_args()

sizes = {"train" : 50000, 
         "dev" : 8000, 
         "test" :8000,
         #"small_eval" : 100000,
         }

indices = {}

with meta_open(options.input) as ifd:
    data = DataSet.from_stream(ifd)
    start = 0
    end = 0
    for name, size in sizes.iteritems():        
        total_words = 0
        while total_words < size:
            total_words += len(data.sentences[end])
            end += 1
        indices[name] = (start, end)
        start = end
    indices["big_eval"] = (end, len(data.sentences))
        
for name in ["train", "dev", "test"]:
    start, end = indices[name]
    with meta_open(os.path.join(options.output, "pos", "%s.pos" % name), "w") as ofd:
        text = "\n\n".join(["\n".join(["%s\t%s" % (data.indexToWord[w].lower(), data.indexToTag[t]) for w, t, aa in s]) for s in data.sentences[start:end]])