def random_tags(target, source, env):
    """Write a copy of a dataset in which every token carries a random tag.

    Has the ``(target, source, env)`` shape of a build action (presumably
    SCons -- confirm against the caller).

    Args:
        target: ``target[0].rstr()`` is the path the new dataset is written to.
        source: ``source[0].rstr()`` is the path of the input dataset;
            ``source[1].read()`` yields the style name.  ``"type-based"``
            draws one tag per word type and reuses it for every occurrence;
            any other value draws a fresh tag for every token.
        env: mapping providing ``NUM_TAGS``, the number of distinct tags.

    Returns:
        None.

    The words and analyses of the input are kept; only the tags are replaced.
    Tags are written as the decimal strings ``"0"`` .. ``str(NUM_TAGS - 1)``.
    """
    num_tags = env["NUM_TAGS"]
    style_name = source[1].read()
    with meta_open(source[0].rstr()) as ifd:
        # from_stream() returns a sequence; only its last element is used.
        data = DataSet.from_stream(ifd)[-1]

    # random.randint() includes BOTH endpoints, so randint(0, num_tags) would
    # produce num_tags + 1 distinct tags.  Cap the upper bound at
    # num_tags - 1; the max() keeps NUM_TAGS <= 0 from raising and degrades
    # to the single tag "0".
    # NOTE(review): relies on `randint` being the inclusive stdlib function,
    # which its other uses in this file (e.g. randint(1, len(w))) imply.
    max_tag = max(num_tags - 1, 0)

    if style_name == "type-based":
        tag_by_word_index = {i: randint(0, max_tag) for i in data.indexToWord}

        def choose_tag(word_index):
            return tag_by_word_index[word_index]
    else:
        def choose_tag(word_index):
            return randint(0, max_tag)

    new_data = DataSet.from_sentences([
        [
            (
                data.indexToWord[w],
                str(choose_tag(w)),
                [data.indexToAnalysis[a] for a in aa],
            )
            for w, _old_tag, aa in s
        ]
        for s in data.sentences
    ])

    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None
def conllish_to_xml(target, source, env):
    """Convert a CoNLL-like "word tag" file into a DataSet file.

    The input is read from ``source[0].rstr()``.  Sentences are separated by
    an empty line (``"\\n\\n"``); every non-blank line must consist of exactly
    two whitespace-separated fields, the word and its tag.  Each token is
    stored with an empty analysis list, and the result is written to
    ``target[0].rstr()`` in whatever format ``DataSet.write`` emits.

    Args:
        target: ``target[0].rstr()`` is the output path.
        source: ``source[0].rstr()`` is the input path.
        env: unused.

    Returns:
        None.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    with meta_open(source[0].rstr()) as ifd:
        text = ifd.read()

    sentences = []
    for chunk in text.split("\n\n"):
        sentence = []
        for line in chunk.split("\n"):
            # str.split() ignores leading/trailing whitespace (including a
            # stray "\r"), which would otherwise show up as empty fields and
            # break the two-way unpacking below.
            fields = line.split()
            if not fields:
                continue  # blank or whitespace-only line
            word, tag = fields
            sentence.append((word, tag, []))
        # A trailing blank line or a run of several blank lines yields chunks
        # with no tokens; do not emit those as empty sentences.
        if sentence:
            sentences.append(sentence)

    data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        data.write(ofd)
    return None
def rtm_to_data(target, source, env):
    """Build an untagged, unanalysed DataSet from a line-per-utterance file.

    For every line of ``source[0].rstr()`` the first five whitespace-separated
    fields are discarded (presumably timing/channel metadata -- confirm
    against the input format) and the remaining fields are taken as words,
    skipping the markers ``(())`` and ``IGNORE_TIME_SEGMENT_IN_SCORING``.
    Lines left with no words are dropped.  Each word is stored with tag
    ``None`` and an empty analysis list, and the dataset is written to
    ``target[0].rstr()``.

    Args:
        target: ``target[0].rstr()`` is the output path.
        source: ``source[0].rstr()`` is the input path.
        env: unused.

    Returns:
        None.
    """
    skipped_markers = ("(())", "IGNORE_TIME_SEGMENT_IN_SCORING")

    token_rows = []
    with meta_open(source[0].rstr()) as ifd:
        for line in ifd:
            row = [
                (token, None, [])
                for token in line.split()[5:]
                if token not in skipped_markers
            ]
            if row:
                token_rows.append(row)

    dataset = DataSet.from_sentences(token_rows)
    with meta_open(target[0].rstr(), "w") as ofd:
        dataset.write(ofd)
    return None
def add_morphology(target, source, env):
    """Attach analyses from a morphology file to every token of a dataset.

    ``source[0].rstr()`` is the dataset to read (the last element returned by
    ``DataSet.from_stream`` is used) and ``source[1].rstr()`` is the
    morphology file.  Each morphology line has the form::

        word<TAB>analysis, analysis, ...

    where an analysis is a whitespace-separated list of morphs.  Morphs that
    start with ``~`` are skipped, and only the part of a morph before its
    first ``:`` is kept.  Each analysis becomes a tuple of morph strings; if a
    word occurs on several lines, the last line wins.

    Words and tags of the input are preserved; analyses already present in
    the input are discarded and replaced by
    ``get_without_case(word, morphology)`` (presumably a case-insensitive
    lookup -- confirm against that helper).  The result is written to
    ``target[0].rstr()``.

    Args:
        target: ``target[0].rstr()`` is the output path.
        source: dataset path and morphology path, as described above.
        env: unused.

    Returns:
        None.
    """
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]

    morphology = {}
    with meta_open(source[1].rstr()) as ifd:
        for line in ifd:
            word, raw_analyses = line.split("\t")
            morphology[word] = set(
                tuple(
                    morph.split(":")[0]
                    for morph in raw_analysis.split()
                    if not morph.startswith("~")
                )
                for raw_analysis in raw_analyses.split(", ")
            )

    annotated = []
    for sentence in data.sentences:
        tokens = []
        for word_index, tag_index, _old_analyses in sentence:
            word = data.indexToWord[word_index]
            tag = data.indexToTag.get(tag_index, None)
            tokens.append((word, tag, get_without_case(word, morphology)))
        annotated.append(tokens)

    new_data = DataSet.from_sentences(annotated)
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None
def random_segmentations(target, source, env):
    """Write a copy of a dataset whose analyses are random segmentations.

    Every token keeps its word and tag but receives exactly one analysis: a
    random split of the word into an optional prefix, a stem of at least one
    character, and an optional suffix.

    Args:
        target: ``target[0].rstr()`` is the output path.
        source: ``source[0].rstr()`` is the input dataset path;
            ``source[1].read()`` yields the style name.  ``"type-based"``
            draws one segmentation per word type and reuses it for every
            occurrence; any other value draws a fresh segmentation for every
            token.
        env: unused.

    Returns:
        None.
    """

    def get_random_segmentation(w):
        """Return ``w`` cut into one to three contiguous, non-empty pieces."""
        stem_length = randint(1, len(w))
        prefix_length = randint(0, len(w) - stem_length)
        stem_end = prefix_length + stem_length
        # The suffix is simply whatever follows the stem, so no third random
        # draw is needed (the previous code drew an unused suffix length).
        pieces = (w[:prefix_length], w[prefix_length:stem_end], w[stem_end:])
        return tuple(piece for piece in pieces if piece)

    style_name = source[1].read()
    with meta_open(source[0].rstr()) as ifd:
        # from_stream() returns a sequence; only its last element is used.
        data = DataSet.from_stream(ifd)[-1]

    if style_name == "type-based":
        # items() rather than iteritems() so this runs on Python 2 and 3.
        analysis_by_word_index = {
            i: get_random_segmentation(w) for i, w in data.indexToWord.items()
        }
        sentences = [
            [
                (
                    data.indexToWord[w],
                    data.indexToTag.get(t, None),
                    [analysis_by_word_index[w]],
                )
                for w, t, _old_analyses in s
            ]
            for s in data.sentences
        ]
    else:
        sentences = [
            [
                (
                    data.indexToWord[w],
                    data.indexToTag.get(t, None),
                    [get_random_segmentation(data.indexToWord[w])],
                )
                for w, t, _old_analyses in s
            ]
            for s in data.sentences
        ]

    new_data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None