# Shared imports for the snippets below; head_finder and the read_conll_*
# helpers (read_conll_text, read_conll_parses, etc.) are assumed to be
# defined elsewhere in this package.
import codecs
import sys
from collections import defaultdict


def read_conll_doc(filename, ans=None, rtext=True, rparses=True, rheads=True,
                   rclusters=True, rner=True, rspeakers=True, rfcol=False):
    if ans is None:
        ans = {}
    cur = []
    keys = None
    for line in codecs.open(filename, 'r', 'utf-8'):
        # A '#begin'/'#end' marker delimits one document part.
        if len(line) > 0 and line.startswith('#begin') or line.startswith('#end'):
            if 'begin' in line:
                desc = line.split()
                location = desc[2].strip('();')
                keys = (location, desc[-1])
            # Flush the lines collected since the last marker.
            if len(cur) > 0:
                if keys is None:
                    print >> sys.stderr, "Error reading conll file - invalid #begin statement\n", line
                else:
                    info = {}
                    if rtext:
                        info['text'] = read_conll_text(cur)
                    if rparses:
                        info['parses'] = read_conll_parses(cur)
                        if rheads:
                            info['heads'] = [head_finder.collins_find_heads(parse)
                                             for parse in info['parses']]
                    if rclusters:
                        info['mentions'], info['clusters'] = read_conll_coref(cur)
                    if rner:
                        info['ner'] = read_conll_ner(cur)
                    if rspeakers:
                        info['speakers'] = read_conll_speakers(cur)
                    if rfcol:
                        info['fcol'] = read_conll_fcol(cur)
                    if keys[0] not in ans:
                        ans[keys[0]] = {}
                    ans[keys[0]][keys[1]] = info
                keys = None
                cur = []
        else:
            cur.append(line)
    return ans
def read_conll_doc(filename, ans=None, rtext=True, rparses=True, rheads=True,
                   rclusters=True, rner=True, rspeakers=True, rfcol=False):
    # Canasai's addition end
    # Read entire file, inserting into a dictionary:
    #  key - the #begin <blah> info
    #  value - a dict, one entry per part, each entry contains:
    #   - text
    #   - parses
    #   - heads
    #   - coreference clusters
    if ans is None:
        ans = defaultdict(lambda: {})
    cur = []
    keys = None
    # Canasai's comment out: for line in open(filename):
    for line in codecs.open(filename, 'r', 'utf-8'):
        if len(line) > 0 and line.startswith('#begin') or line.startswith('#end'):
            if 'begin' in line:
                desc = line.split()
                location = desc[2].strip('();')
                keys = (location, desc[-1])
                # Canasai's comment out:
                # if "tc/ch/00/ch" in keys[0] and '9' not in keys[0]:
                #     val = int(keys[0].split('_')[-1]) * 10 - 1
                #     keys = ("tc/ch/00/ch_%04d" % val, keys[1])
            if len(cur) > 0:
                if keys is None:
                    print >> sys.stderr, "Error reading conll file - invalid #begin statement\n", line
                else:
                    info = {}
                    if rtext:
                        info['text'] = read_conll_text(cur)
                    if rparses:
                        info['parses'] = read_conll_parses(cur)
                        if rheads:
                            info['heads'] = [head_finder.collins_find_heads(parse)
                                             for parse in info['parses']]
                    if rclusters:
                        info['mentions'], info['clusters'] = read_conll_coref(cur)
                    if rner:
                        info['ner'] = read_conll_ner(cur)
                    # Canasai's addition begin
                    if rspeakers:
                        info['speakers'] = read_conll_speakers(cur)
                    if rfcol:
                        info['fcol'] = read_conll_fcol(cur)
                    # Canasai's addition end
                    ans[keys[0]][keys[1]] = info
                keys = None
                cur = []
        else:
            cur.append(line)
    return ans
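# --- Usage sketch (not part of the original module) ---
# A minimal illustration of how the nested dictionary returned by
# read_conll_doc is typically walked: the outer key is the document id from
# the '#begin' line, the inner key is the part number. The file name below
# is a hypothetical placeholder for a CoNLL/OntoNotes-formatted file.
if __name__ == '__main__':
    docs = read_conll_doc('sample.v4_gold_conll')
    for doc_id in docs:
        for part_id in docs[doc_id]:
            info = docs[doc_id][part_id]
            # 'text' holds one token list per sentence when rtext=True.
            print("%s part %s: %d sentences" % (doc_id, part_id, len(info['text'])))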
def get_tree_data(sentence_list, parse_list, ner_list, character_to_index,
                  word_to_index, pos_to_index, index_to_lexicon):
    """
    Get tree structured data from CoNLL-2003

    Stores into Node data structure
    """
    log("get_tree_data()...")
    tree_pyramid_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]

    for index, parse in enumerate(parse_list):
        text_raw_data = sentence_list[index]
        word_count += len(text_raw_data)
        span_to_node = {}
        head_raw_data = head_finder.collins_find_heads(parse)

        # Build the tree for this sentence rooted at a fresh Node.
        root_node = Node()
        nodes = construct_node(
            root_node, parse, ner_list[index], head_raw_data, text_raw_data,
            character_to_index, word_to_index, pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
        root_node.nodes = nodes
        root_node.tokens = len(text_raw_data)

        additional_node_list = create_dense_nodes(
            ner_list[index], text_raw_data, pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)

        tree_pyramid_list.append((root_node, additional_node_list))

    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count,
            lexicon_hits[0])
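# --- Usage sketch (not part of the original project) ---
# A minimal sketch of get_tree_data's input/output shapes. The sentence,
# parse, NER, and vocabulary objects are assumed to be produced by other
# helpers in this code base; the wrapper name below is a placeholder only.
def summarize_tree_data(sentence_list, parse_list, ner_list,
                        character_to_index, word_to_index, pos_to_index,
                        index_to_lexicon):
    (tree_pyramid_list, word_count, pos_count, ne_count,
     pos_ne_count, lexicon_hits) = get_tree_data(
        sentence_list, parse_list, ner_list, character_to_index,
        word_to_index, pos_to_index, index_to_lexicon)
    log("%d sentences, %d words\n" % (len(tree_pyramid_list), word_count))
    log("%d POS tags, %d NE labels, %d lexicon hits\n"
        % (len(pos_count), len(ne_count), lexicon_hits))
    return tree_pyramid_list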