def read_data(lines, tree_flag, score_cat_rex=None, ignore_terminal_rex=None,
              word_split_rex=None, debug_level=0):
    """Read data either in tree format or in flat format.

    lines               -- sequence of input lines (indexed as lines[0] by the
                           debug summary, so it should support indexing when
                           debug_level >= 10000).
    tree_flag           -- if true, each line is parsed with tb.string_trees,
                           wrapped under a 'ROOT' label and converted with
                           tree_string(); otherwise each line is split into
                           words directly.
    score_cat_rex       -- passed through to tree_string() (tree mode only).
    ignore_terminal_rex -- tree mode: passed through to tree_string().
                           flat mode: words matching it are dropped; None
                           disables the filter.
    word_split_rex      -- flat mode only: compiled regex whose split() yields
                           the words; None falls back to whitespace splitting.
    debug_level         -- >= 1000 traces every tree-mode line on stderr,
                           >= 10000 also prints a summary of the first entry.

    Returns (segments, stringpos): segments[i] is the words of entry i joined
    with no separator, stringpos[i] is words_stringpos() of those words.
    """
    if tree_flag:
        words = []
        for line in lines:
            trees = tb.string_trees(line)
            trees.insert(0, 'ROOT')
            words.append(tree_string(trees, score_cat_rex, ignore_terminal_rex))
            if debug_level >= 1000:
                sys.stderr.write("# line = %s,\n# words = %s\n" % (line, words[-1]))
    else:
        # Both regexes default to None in the signature, so neither may be
        # dereferenced unconditionally.
        def split_line(line):
            if word_split_rex is None:
                return line.split()
            return word_split_rex.split(line)

        def keep(w):
            if w == '':
                return False
            return ignore_terminal_rex is None or not ignore_terminal_rex.match(w)

        words = [[w for w in split_line(line) if keep(w)] for line in lines]

    segments = [''.join(ws) for ws in words]

    # Only summarise when there is a first entry to show; indexing [0] on
    # empty input used to raise IndexError.
    if debug_level >= 10000 and words:
        sys.stderr.write("lines[0] = %s\n" % lines[0])
        sys.stderr.write("words[0] = %s\n" % words[0])
        sys.stderr.write("segments[0] = %s\n" % segments[0])

    stringpos = [words_stringpos(ws) for ws in words]
    return (segments, stringpos)
def read_data(inf, tree_flag, score_cat_rex, ignore_terminal_rex, debug_level=0, max_lines=0):
    """Read data from inf, either in tree format or in flat format.

    Every line of inf is stripped.  When max_lines > 0 only the first
    max_lines lines are used, and a warning is written to stderr if fewer
    lines than that were available.

    In tree mode each line is parsed with tb.string_trees, wrapped under a
    'ROOT' label and converted with tree_words(); in flat mode each
    whitespace-separated token is converted with string_word().

    Returns (sentences, (stringpos, topics, topicstringpos)).  A sentence is
    the concatenation of its words, where a tuple-valued word contributes
    only its first element.
    """
    stripped = [raw.strip() for raw in inf]
    if max_lines > 0:
        n_available = len(stripped)
        if n_available < max_lines:
            sys.stderr.write("Warning: max_lines=%d, len(strippedlines) = %d\n"
                             % (max_lines, n_available))
        stripped = stripped[:max_lines]

    if not tree_flag:
        word_lists = [[string_word(token) for token in text.split()]
                      for text in stripped]
    else:
        word_lists = []
        for text in stripped:
            parsed = tb.string_trees(text)
            parsed.insert(0, 'ROOT')
            word_lists.append(tree_words(parsed, score_cat_rex, ignore_terminal_rex))
            if debug_level >= 10000:
                sys.stderr.write("# line = %s,\n# words = %s\n" % (text, word_lists[-1]))

    # print "# tree_flag =", tree_flag, "source =", strippedlines[-1], "line =", lines[-1]

    # The four result lists are built in separate passes, in this order.
    sentences = []
    for ws in word_lists:
        pieces = (w[0] if isinstance(w, tuple) else w for w in ws)
        sentences.append(''.join(pieces))
    topics = [words_topics(ws) for ws in word_lists]
    stringpos = [words_stringpos(ws) for ws in word_lists]
    topicstringpos = [words_topicstringpos(ws) for ws in word_lists]
    return (sentences, (stringpos, topics, topicstringpos))
def read_write(inf, outf=sys.stdout, nskip=0):
    """Read data from inf in tree format and write one string per tree line.

    inf   -- iterable of lines; each line is stripped before use.
    outf  -- file-like object receiving the output (default sys.stdout).
    nskip -- number of blank lines that must be consumed before anything is
             written; every blank line decrements it, including consecutive
             ones.

    While nskip <= 0, each non-blank line is parsed with tb.string_trees,
    wrapped under a 'ROOT' label, converted with tree_string() and written
    (stripped) followed by a newline.  Each blank line seen while nskip <= 0
    is echoed as a newline and outf is flushed.
    """
    for line in inf:
        line = line.strip()
        if line:
            if nskip <= 0:
                trees = tb.string_trees(line)
                trees.insert(0, 'ROOT')
                outf.write(tree_string(trees).strip())
                outf.write('\n')
        else:
            if nskip <= 0:
                outf.write('\n')
                outf.flush()
            # Every blank line counts towards the skip budget, whether or
            # not it was echoed.
            nskip -= 1
    # The former trailing re-parse of `line` was dropped: its result was
    # never used.
def read_write(inf, outf=sys.stdout, nskip=0):
    """Read data from inf in tree format and write one string per tree line.

    inf   -- iterable of lines; each line is stripped before use.
    outf  -- file-like object receiving the output (default sys.stdout).
    nskip -- number of blank lines that must be consumed before anything is
             written; every blank line decrements it, including consecutive
             ones.

    Lines containing the substring "lexentry" are ignored entirely.  While
    nskip <= 0, each remaining non-blank line is parsed with tb.string_trees,
    wrapped under a 'ROOT' label, converted with tree_string() and written
    (stripped) followed by a newline.  Each blank line seen while nskip <= 0
    is echoed as a newline and outf is flushed.
    """
    for line in inf:
        line = line.strip()
        # hacky way of excluding the spurious entries
        if "lexentry" in line:
            continue
        if line:
            if nskip <= 0:
                trees = tb.string_trees(line)
                trees.insert(0, 'ROOT')
                outf.write(tree_string(trees).strip())
                outf.write('\n')
        else:
            if nskip <= 0:
                outf.write('\n')
                outf.flush()
            # Every blank line counts towards the skip budget, whether or
            # not it was echoed.
            nskip -= 1
    # The former trailing re-parse of `line` was dropped: its result was
    # never used.
def setUp(self):
    """Build the tree fixtures and the two evaluation tables used by the tests.

    Two three-tree bracketed strings are parsed with tb.string_trees and then
    scored twice with EvalParse: once with default settings and once with
    evaluate_word_coverage=True.  The resulting table() of each evaluator is
    stored on the test case.
    """
    # NOTE(review): "g"/"p" presumably stand for gold and predicted trees --
    # confirm against EvalParse's argument order.
    gstr = """(S (EDITED (NP (EX there)) (, ,)) (NP (EX there)) (VP (BES 's) (NP (DT no) (NN way))) (. .)) (S (CC and) (, ,) (INTJ (UH uh)) (PRN (, ,) (S (NP (PRP you)) (VP (VBP know))) (, ,)) (NP (DT all))) (S (EDITED (EDITED (EDITED (S (NP (EX There)) (VP (BES 's))) (, ,)) (NP (EX there)) (, ,)) (NP (DT th-)) (, ,)) (NP (DT this) (NN topic)) (VP (VBZ is) (ADJP (ADVP (RB kind) (RB of)) (TYPO (JJ mute))) (. .) (INTJ (UH Uh)))) """
    # Differs from gstr in the first two trees: the first has no EDITED
    # bracket, the second is labelled S1 and has its commas outside the PRN
    # bracket.  The third tree is identical.
    pstr = """(S (NP (EX there)) (, ,) (NP (EX there)) (VP (BES 's) (NP (DT no) (NN way))) (. .)) (S1 (CC and) (, ,) (INTJ (UH uh)) (, ,) (PRN (S (NP (PRP you)) (VP (VBP know)))) (, ,) (NP (DT all))) (S (EDITED (EDITED (EDITED (S (NP (EX There)) (VP (BES 's))) (, ,)) (NP (EX there)) (, ,)) (NP (DT th-)) (, ,)) (NP (DT this) (NN topic)) (VP (VBZ is) (ADJP (ADVP (RB kind) (RB of)) (TYPO (JJ mute))) (. .) (INTJ (UH Uh)))) """
    self.gs = tb.string_trees(gstr)
    self.ps = tb.string_trees(pstr)
    # Evaluator with default options; it is called with (ps, gs) in that order.
    self.e_no_words = EvalParse()
    self.e_no_words(self.ps, self.gs)
    self.e_no_words_tbl = self.e_no_words.table()
    # print(self.e_no_words_tbl)
    # Same comparison with word-coverage evaluation switched on.
    self.e_words = EvalParse(evaluate_word_coverage=True)
    self.e_words(self.ps, self.gs)
    self.e_words_tbl = self.e_words.table()
def getAnalysis(sTree, wCat, sCat, wordSep, segSep, sylSep, noLabel):
    """
    Return the yield of a tree with syllabic and segmental information.

    sTree is parsed with tb.string_trees and wrapped under a 'ROOT' label;
    visitTree() then fills a list of word strings, which are joined with
    wordSep.

    As stated by the original documentation of this function:
      each segment is indexed with "_C" or "_V" (consonant or vowel)
      if noLabel == FALSE
      each syllable is separated by sylSep
      each word is separated by wordSep
      each segment is separated by segSep
    """
    collected = []
    parsed = tb.string_trees(sTree)
    parsed.insert(0, 'ROOT')
    # visitTree appends to `collected` in place.
    visitTree(parsed, collected, wCat, sCat, segSep, sylSep, noLabel)
    return wordSep.join(collected)
def readwords(fname, wordrex, wordtype_word_fname_count):
    """Accumulate word counts from the tree lines of file fname.

    fname   -- path of the file to read.
    wordrex -- compiled regex matched against each tree's root label;
               group(1) is used as the word type and group(2) as an integer
               count.
    wordtype_word_fname_count -- counter structure updated in place through
               lx.incr3(counter, wordtype, word, fname, wordcount).

    Only lines whose first character is '(' are considered.  Each such line
    must parse to exactly one tree, and that tree must be a list; the word is
    the concatenation of the tree's terminals.  Returns None.
    """
    # `with open(...)` closes the handle deterministically; the previous
    # `file(fname, "rU")` was never closed and `file` is not a builtin in
    # Python 3.  Text mode already normalises line endings there.
    with open(fname) as inf:
        for line in inf:
            if not line.startswith('('):
                continue
            trees = tb.string_trees(line.strip())
            assert len(trees) == 1, "expected exactly one tree per line"
            tree = trees[0]
            assert isinstance(tree, list), "expected a non-terminal tree"
            label = tree[0]
            mo = wordrex.match(label)
            if mo:
                wordtype = mo.group(1)
                wordcount = int(mo.group(2))
                word = ''.join(tb.terminals(tree))
                lx.incr3(wordtype_word_fname_count, wordtype, word, fname, wordcount)
def read_data(lines, tree_flag, score_cat_rex=None, ignore_terminal_rex=None,
              word_split_rex=None, debug_level=0):
    """Read data either in tree format or in flat format.

    Lines containing the substring "lexentry" are skipped in both modes.

    lines               -- sequence of input lines (indexed as lines[0] by the
                           debug summary, so it should support indexing when
                           debug_level >= 10000).
    tree_flag           -- if true, each line is parsed with tb.string_trees,
                           wrapped under a 'ROOT' label and converted with
                           tree_string(); otherwise each line is split into
                           words directly.
    score_cat_rex       -- passed through to tree_string() (tree mode only).
    ignore_terminal_rex -- tree mode: passed through to tree_string().
                           flat mode: words matching it are dropped; None
                           disables the filter.
    word_split_rex      -- flat mode only: compiled regex whose split() yields
                           the words; None falls back to whitespace splitting.
    debug_level         -- >= 1000 traces every tree-mode line on stderr,
                           >= 10000 also prints a summary of the first entry.

    Returns (segments, stringpos): segments[i] is the words of retained entry
    i joined with no separator, stringpos[i] is words_stringpos() of those
    words.
    """
    if tree_flag:
        words = []
        for line in lines:
            if "lexentry" in line:
                continue
            trees = tb.string_trees(line)
            trees.insert(0, 'ROOT')
            words.append(tree_string(trees, score_cat_rex, ignore_terminal_rex))
            if debug_level >= 1000:
                sys.stderr.write("# line = %s,\n# words = %s\n" % (line, words[-1]))
    else:
        # Both regexes default to None in the signature, so neither may be
        # dereferenced unconditionally.
        def split_line(line):
            if word_split_rex is None:
                return line.split()
            return word_split_rex.split(line)

        def keep(w):
            if w == '':
                return False
            return ignore_terminal_rex is None or not ignore_terminal_rex.match(w)

        words = [[w for w in split_line(line) if keep(w)]
                 for line in lines if "lexentry" not in line]

    segments = [''.join(ws) for ws in words]

    # Only summarise when at least one entry survived the filter; indexing
    # [0] used to raise IndexError when the input was empty or every line was
    # a "lexentry" line.  lines[0] is the first input line and may itself be
    # a skipped one.
    if debug_level >= 10000 and words:
        sys.stderr.write("lines[0] = %s\n" % lines[0])
        sys.stderr.write("words[0] = %s\n" % words[0])
        sys.stderr.write("segments[0] = %s\n" % segments[0])

    stringpos = [words_stringpos(ws) for ws in words]
    return (segments, stringpos)
def read_data(inf, tree_flag, types_flag, score_cat_rex, ignore_terminal_rex, word_split_rex, debug_level=0, max_lines=0):
    """Read data from inf, either in tree format or in flat format.

    Every line of inf is stripped.  When max_lines > 0 only the first
    max_lines lines are used, and a warning is written to stderr if fewer
    lines than that were available.

    In tree mode each line is parsed with tb.string_trees, wrapped under a
    'ROOT' label and converted with tree_words(); in flat mode each line is
    split with word_split_rex and empty pieces are discarded.

    When types_flag is true, only the first entry for each distinct
    concatenated word string is kept (order of first occurrence preserved).

    Returns (sentences, stringpos): sentences[i] is the words of entry i
    joined with no separator, stringpos[i] is words_stringpos() of them.
    """
    stripped = [raw.strip() for raw in inf]
    if max_lines > 0:
        n_available = len(stripped)
        if n_available < max_lines:
            sys.stderr.write("Warning: max_lines=%d, len(strippedlines) = %d\n"
                             % (max_lines, n_available))
        stripped = stripped[:max_lines]

    all_words = []
    if tree_flag:
        for text in stripped:
            parsed = tb.string_trees(text)
            parsed.insert(0, 'ROOT')
            all_words.append(tree_words(parsed, score_cat_rex, ignore_terminal_rex))
            if debug_level >= 10000:
                sys.stderr.write("# line = %s,\n# words = %s\n" % (text, all_words[-1]))
    else:
        for text in stripped:
            all_words.append([piece for piece in word_split_rex.split(text)
                              if piece != ""])

    if types_flag:
        # Keep one entry per distinct surface string.
        kept = []
        seen = set()
        for ws in all_words:
            surface = ''.join(ws)
            if surface in seen:
                continue
            seen.add(surface)
            kept.append(ws)
    else:
        kept = all_words

    # print "# tree_flag =", tree_flag, "source =", strippedlines[-1], "line =", lines[-1]
    sentences = [''.join(ws) for ws in kept]
    stringpos = [words_stringpos(ws) for ws in kept]
    return (sentences, stringpos)