def prepare_datasets_for_emma(target, source, env):
    # the final source may carry extra arguments; it is read but not used below
    args = source[-1].read()
    # each input is either a DataSet XML file or an already-formatted "word<TAB>analyses" file
    try:
        with meta_open(source[0].rstr()) as ifdA:
            dataA = DataSet.from_stream(ifdA)[0]
            analysesA = {w : ", ".join([" ".join(["%s:NULL" % m for m in a]) for a in aa]) for w, aa in dataA.get_analyses().iteritems()}
    except Exception:
        with meta_open(source[0].rstr()) as ifdA:
            analysesA = {w : r for w, r in [l.strip().split("\t") for l in ifdA]}
    try:
        with meta_open(source[1].rstr()) as ifdB:
            dataB = DataSet.from_stream(ifdB)[0]
            analysesB = {w : ", ".join([" ".join(["%s:NULL" % m for m in a]) for a in aa]) for w, aa in dataB.get_analyses().iteritems()}
    except Exception:
        with meta_open(source[1].rstr()) as ifdB:
            analysesB = {w : r for w, r in [l.strip().split("\t") for l in ifdB]}
    # only words analyzed by both systems are written out, so the two files stay aligned
    wordsA = set(analysesA.keys())
    wordsB = set(analysesB.keys())
    common_words = wordsA.intersection(wordsB)
    with meta_open(target[0].rstr(), "w") as ofdA, meta_open(target[1].rstr(), "w") as ofdB:
        for word in common_words:
            ofdA.write(("%s\t%s\n" % (word, analysesA[word])).encode("utf8"))
            ofdB.write(("%s\t%s\n" % (word, analysesB[word])).encode("utf8"))
    return None
def evaluate_morphology(target, source, env):
    with meta_open(source[0].rstr()) as gold_fd, meta_open(source[1].rstr()) as pred_fd:
        gold = DataSet.from_stream(gold_fd)
        pred = DataSet.from_stream(pred_fd)
        gold_analyses = gold.get_analyses()
        pred_analyses = pred.get_analyses()
    # NOTE: gold_analyses and pred_analyses are not actually compared yet;
    # a constant placeholder score is written out
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("%f\n" % 1.0)
    return None
def random_tags(target, source, env):
    num_tags = env["NUM_TAGS"]
    style_name = source[1].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    # randint is inclusive at both ends, so draw from 0 .. num_tags - 1 to get num_tags distinct tags
    if style_name == "type-based":
        # one random tag per word type, shared by all of its tokens
        wordIndexToTag = {i : randint(0, num_tags - 1) for i in data.indexToWord.keys()}
        new_data = DataSet.from_sentences([[(data.indexToWord[w], str(wordIndexToTag[w]), [data.indexToAnalysis[a] for a in aa]) for w, t, aa in s] for s in data.sentences])
    else:
        # token-based: an independent random tag for every token
        new_data = DataSet.from_sentences([[(data.indexToWord[w], str(randint(0, num_tags - 1)), [data.indexToAnalysis[a] for a in aa]) for w, t, aa in s] for s in data.sentences])
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None
def conllish_to_xml(target, source, env):
    """Convert a CoNLL-like file (one whitespace-separated word/tag pair per line, blank line
    between sentences) into the XML DataSet format."""
    with meta_open(source[0].rstr()) as ifd:
        sentences = [[(w, t, []) for w, t in [re.split(r"\s+", x) for x in s.split("\n") if not re.match(r"^\s*$", x)]]
                     for s in re.split(r"\n\n", ifd.read(), flags=re.M)]
    data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        data.write(ofd)
    return None
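# Example of the input conllish_to_xml() expects.  The words and tags below are purely
# illustrative, not from any particular corpus; the only assumptions are the ones the code
# itself makes: whitespace-separated word/tag pairs, one token per line, sentences separated
# by a blank line.
#
#   The     DET
#   dog     NOUN
#   barks   VERB
#
#   It      PRON
#   sleeps  VERB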
def add_morphology(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    # parse the "word<TAB>analyses" file into a word -> set-of-morph-tuples mapping,
    # keeping only the morph text (before ":") and skipping morphs marked with "~"
    morphology = {}
    with meta_open(source[1].rstr()) as ifd:
        for l in ifd:
            word, analyses = l.split("\t")
            morphology[word] = set()
            for analysis in analyses.split(", "):
                morphology[word].add(tuple([morph.split(":")[0] for morph in analysis.split() if not morph.startswith("~")]))
    new_data = DataSet.from_sentences([[(data.indexToWord[w], data.indexToTag.get(t, None), get_without_case(data.indexToWord[w], morphology)) for w, t, aa in s] for s in data.sentences])
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None
def create_subset(target, source, env):
    amount = source[1].read()
    # the subsetting method is read but not currently used: the first `amount` sentences are taken
    method = source[2].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)
    subset = data.get_subset(range(amount))
    with meta_open(target[0].rstr(), "w") as ofd:
        subset.write(ofd)
    return None
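# Hypothetical SConstruct wiring for the builder actions above (file names and sizes are
# illustrative, not part of this module).  Several of these actions read plain Python values
# from their source list via .read(), which is what SCons Value nodes provide; random_tags()
# additionally expects NUM_TAGS to be set on the construction environment.
#
#   env = Environment(NUM_TAGS=45)
#   env.Command("work/train_subset.xml",
#               ["work/train.xml", env.Value(10000), env.Value("sentences")],
#               create_subset)
#   env.Command("work/random_tags.xml",
#               ["work/train.xml", env.Value("type-based")],
#               random_tags)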
def rtm_to_data(target, source, env):
    sentences = []
    with meta_open(source[0].rstr()) as ifd:
        for sentence in ifd:
            # skip the first five metadata fields, plus empty and ignored tokens
            words = [w for w in sentence.split()[5:] if w not in ["(())", "IGNORE_TIME_SEGMENT_IN_SCORING"]]
            if len(words) > 0:
                sentences.append(words)
    dataset = DataSet.from_sentences([[(w, None, []) for w in s] for s in sentences])
    with meta_open(target[0].rstr(), "w") as ofd:
        dataset.write(ofd)
    return None
def morfessor_to_tripartite(target, source, env):
    """Collapse each analysis into at most prefix + stem + suffix, treating the longest morph as the stem."""
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)
    for k in data.indexToAnalysis.keys():
        old = [x[1] for x in data.indexToAnalysis[k]]
        # find the index of the longest morph, which is taken to be the stem
        sizes = sorted(enumerate([len(x) for x in old]), key=lambda x: x[1])
        stem_index = sizes[-1][0]
        stem = ("stem", old[stem_index])
        prefix = ("prefix", "".join(old[:stem_index]))
        suffix = ("suffix", "".join(old[stem_index + 1:]))
        data.indexToAnalysis[k] = [({}, x[1]) for x in [prefix, stem, suffix] if x[1] != ""]
    with meta_open(target[0].rstr(), "w") as ofd:
        data.write(ofd)
    return None
def dataset_to_emma(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    with meta_open(target[0].rstr(), "w") as ofd:
        for word, analyses in sorted(data.get_analyses().iteritems()):
            word = word.lower()
            if len(analyses) == 0:
                # unanalyzed words are written as a single morph with no features
                x = "%s\t%s:NULL\n" % (word, word)
            else:
                x = "%s\t%s\n" % (word, ", ".join([" ".join(["%s:NULL" % m.lower() for m in a]) for a in analyses]))
            ofd.write(x.encode("utf-8"))
    return None
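# Example of the EMMA-style output dataset_to_emma() produces.  The words and segmentations
# below are illustrative only; the format follows directly from the code above: one word per
# line, a tab, then its analyses separated by ", ", each analysis a space-separated list of
# "morph:NULL" pairs.
#
#   walked	walk:NULL ed:NULL
#   unhappily	un:NULL happy:NULL ly:NULL, unhappy:NULL ly:NULL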
def random_segmentations(target, source, env):
    def get_random_segmentation(w):
        # pick a random stem length, then a random prefix that fits before it;
        # the suffix is whatever remains after the stem
        stem_length = randint(1, len(w))
        prefix_length = randint(0, len(w) - stem_length)
        prefix = w[:prefix_length]
        stem = w[prefix_length:prefix_length + stem_length]
        suffix = w[prefix_length + stem_length:]
        return tuple([x for x in [prefix, stem, suffix] if len(x) > 0])
    style_name = source[1].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    if style_name == "type-based":
        # one random segmentation per word type, shared by all of its tokens
        wordIndexToAnalysis = {}
        for i, w in data.indexToWord.iteritems():
            wordIndexToAnalysis[i] = get_random_segmentation(w)
        sentences = [[(data.indexToWord[w], data.indexToTag.get(t, None), [wordIndexToAnalysis[w]]) for w, t, aa in s] for s in data.sentences]
    else:
        # token-based: an independent random segmentation for every token
        sentences = [[(data.indexToWord[w], data.indexToTag.get(t, None), [get_random_segmentation(data.indexToWord[w])]) for w, t, aa in s] for s in data.sentences]
    new_data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None
def evaluate_tagging_vm(target, source, env):
    with meta_open(source[0].rstr()) as gold_fd, meta_open(source[1].rstr()) as pred_fd:
        gold = DataSet.from_stream(gold_fd)[-1]
        preds = DataSet.from_stream(pred_fd)
    scores = []
    for pred in preds:
        for gold_sentence, pred_sentence in zip(gold.sentences, pred.sentences):
            assert(len(gold_sentence) == len(pred_sentence))
        gold_tags = sum([[l[1] for l in s] for s in gold.sentences], [])
        pred_tags = sum([[l[1] for l in s] for s in pred.sentences], [])
        scores.append({"VM" : v_measure(pred_tags, gold_tags)})
    names = ["VM"]
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("\t".join(names) + "\n")
        for score in scores:
            ofd.write("\t".join(["%.3f" % score[x] for x in names]) + "\n")
    return None
def top_words_by_tag(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]
    counts = numpy.zeros(shape=(len(data.indexToWord), len(data.indexToTag)))
    for sentence in data.sentences:
        for w, t, aa in sentence:
            counts[w, t] += 1
    tag_totals = counts.sum(0)
    word_totals = counts.sum(1)
    keep = 10
    with meta_open(target[0].rstr(), "w") as ofd:
        for tag_id, tag_total in enumerate(tag_totals):
            word_counts = counts[:, tag_id]
            # the `keep` most frequent words for this tag, each written as word-P(word|tag)-P(tag|word)
            indices = [(i, word_counts[i]) for i in reversed(word_counts.argsort())][0:keep]
            ofd.write(" ".join(["%s-%.2f-%.2f" % (data.indexToWord[i], float(c) / tag_total, float(c) / word_totals[i]) for i, c in indices]) + "\n")
    return None
def actual_filter(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        if "xml" in source[0].rstr():
            words = sum([x.indexToWord.values() for x in DataSet.from_stream(ifd)], [])
        else:
            words = sum([[x for x in re.split(r"\s+", l.strip()) if not re.match(r".*\d.*", x)] for l in ifd], [])
    good, bad = split_words(target, source, env, words)
    try:
        with open(target[0].rstr(), "w") as good_ofd, open(target[1].rstr(), "w") as bad_ofd:
            good_ofd.write("\n".join(sorted(good)).encode("utf-8"))
            bad_ofd.write("\n".join(sorted(bad)).encode("utf-8"))
    except Exception:
        # fall back to meta_open (which handles encoding itself) if plain writes fail
        with meta_open(target[0].rstr(), "w") as good_ofd, meta_open(target[1].rstr(), "w") as bad_ofd:
            good_ofd.write("\n".join(sorted(good)))
            bad_ofd.write("\n".join(sorted(bad)))
    return None
def oov_reduction(target, source, env):
    """
    Split expansions into buckets.  For each bucket 0 to N, output a line of the format:

    OOV_TOTAL_TOKENS, OOV_TOTAL_TYPES, BUCKET_TOTAL_TOKENS, BUCKET_TOTAL_TYPES,
    BUCKET_TOTAL_OOV_TOKENS, BUCKET_TOTAL_OOV_TYPES, BUCKET_ACCEPTED_TYPES
    """
    if len(source) == 4:
        bucket_size = 1000
    else:
        bucket_size = source[4].read()
    training_fname, expansion_fname, oov_fname, accepted_fname = [x.rstr() for x in source[0:4]]
    with meta_open(training_fname) as training_ifd, meta_open(expansion_fname) as expansion_ifd, meta_open(oov_fname) as oov_ifd, meta_open(accepted_fname) as accepted_ifd:
        training = set(DataSet.from_stream(training_ifd)[-1].indexToWord.values())
        expansion = [(w, math.exp(-float(lp))) for w, lp in [l.split() for l in expansion_ifd] if w not in training]
        oov = {w : int(c) for c, w in [l.strip().split() for l in oov_ifd] if w not in training}
        accepted = set([x.strip() for x in accepted_ifd])
    # renormalize the expansion probabilities over the out-of-training words
    total_prob = sum([x[1] for x in expansion])
    expansion = [(w, p / total_prob) for w, p in expansion]
    values = [[0, 0, 0, 0, 0]]
    buckets = len(expansion) / bucket_size
    total_oov_tokens = sum(oov.values())
    total_oov_types = len(oov)
    # each bucket's counts accumulate on top of the previous bucket's
    for bucket in range(buckets):
        bucket_total_tokens, bucket_total_types, bucket_total_oov_tokens, bucket_total_oov_types, accepted_types = values[bucket]
        for w, p in expansion[bucket * bucket_size : (bucket + 1) * bucket_size]:
            predicted_oov_count = total_oov_tokens * p
            bucket_total_tokens += predicted_oov_count
            bucket_total_types += 1
            if w in accepted:
                accepted_types += 1
            if w in oov:
                bucket_total_oov_tokens += oov[w]
                bucket_total_oov_types += 1
        values.append([bucket_total_tokens, bucket_total_types, bucket_total_oov_tokens, bucket_total_oov_types, accepted_types])
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("\t".join(["Total OOV Tokens", "Total OOV Types", "Bucket Total Tokens", "Bucket Total Types", "Bucket Total OOV Tokens", "Bucket Total OOV Types", "Bucket Accepted Types"]) + "\n")
        ofd.write("\n".join(["\t".join([str(int(y)) for y in [total_oov_tokens, total_oov_types] + x]) for x in values[1:]]) + "\n")
    return None
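# Hypothetical SConstruct wiring for oov_reduction() (file names are illustrative).  The
# fifth source is optional and, when present, is assumed to be an SCons Value node giving
# the bucket size; otherwise the default of 1000 is used.
#
#   env.Command("work/oov_reduction.txt",
#               ["work/train.xml", "work/expansion_probs.txt", "work/oov_counts.txt",
#                "work/accepted_words.txt", env.Value(500)],
#               oov_reduction)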
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", dest="input")
parser.add_argument("-o", "--output", dest="output")
options = parser.parse_args()

sizes = {"train" : 50000,
         "dev" : 8000,
         "test" : 8000,
         #"small_eval" : 100000,
         }

indices = {}
with meta_open(options.input) as ifd:
    data = DataSet.from_stream(ifd)
start = 0
end = 0
for name, size in sizes.iteritems():
    total_words = 0
    while total_words < size:
        total_words += len(data.sentences[end])
        end += 1
    indices[name] = (start, end)
    start = end
indices["big_eval"] = (end, len(data.sentences))
for name in ["train", "dev", "test"]:
    start, end = indices[name]
    with meta_open(os.path.join(options.output, "pos", "%s.pos" % name), "w") as ofd:
        text = "\n\n".join(["\n".join(["%s\t%s" % (data.indexToWord[w].lower(), data.indexToTag[t]) for w, t, aa in s]) for s in data.sentences[start:end]])