import logging
import os
from typing import List


def read_graphs(reader: TSVReader, filename: str) -> List[NLPGraph]:
    logging.info('Reading: ' + os.path.basename(filename))
    reader.open(filename)
    graphs = reader.next_all  # next_all collects every remaining graph in the file
    reader.close()
    logging.info('- %s graphs' % len(graphs))
    return graphs
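# Usage sketch (illustrative, not from the original source): the column
# indices 1-8 and the sample path are assumptions borrowed from the other
# snippets in this file; TSVReader and NLPGraph are assumed importable from
# the surrounding package.
reader = TSVReader(1, 2, 3, 4, 5, 6, 7, 8)
graphs = read_graphs(reader, '../../../resources/sample.tsv')
for graph in graphs:
    print(len(list(graph.nodes)))  # e.g., node count per parsed graph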
def test_dependencyLabel(self):
    filename = '../../../resources/sample.tsv'
    reader = TSVReader(filename, 1, 2, 3, 4, 5, 6, 7, 8)
    graph = reader.next()
    nodes = list(graph.nodes)
    self.assertEqual(nodes[5].get_dependency_label(nodes[3]), 'advcl')
    self.assertIsNone(nodes[3].get_dependency_label(nodes[5]))
    self.assertEqual(nodes[14].get_dependency_label(nodes[10]), 'ppmod')
def test_nlpnode_set(self):
    filename = '../../../resources/sample.tsv'
    reader = TSVReader(filename, 1, 2, 3, 4, 5, 6, 7, 8)
    graph = reader.next()
    container = list(graph.nodes)
    tempnode1 = NLPNode("TEMP_1")
    container[0].set_parent(tempnode1, 'dep_1')
    self.assertEqual(container[0].parent, tempnode1)
def test_getChild(self):
    filename = '../../../resources/sample.tsv'
    reader = TSVReader(filename, 1, 2, 3, 4, 5, 6, 7, 8)
    graph = reader.next()
    nodes = list(graph.nodes)
    self.assertEqual(nodes[14].get_leftmost_child(), nodes[11])
    self.assertEqual(nodes[19].get_leftmost_child(), nodes[16])
    self.assertEqual(nodes[3].get_leftmost_child(), nodes[1])
    self.assertEqual(nodes[3].get_rightmost_child(), nodes[20])
def test_nlpnode_ancestor(self):
    filename = '../../../resources/sample.tsv'
    reader = TSVReader(filename, 1, 2, 3, 4, 5, 6, 7, 8)
    graph = reader.next()
    nodes = list(graph.nodes)
    self.assertIsNone(nodes[0].parent)
    self.assertIsNone(nodes[0].grandparent)
    self.assertEqual(nodes[3].parent, nodes[0])
    self.assertEqual(nodes[8].parent, nodes[10])
    self.assertEqual(nodes[8].grandparent, nodes[3])
    self.assertTrue(nodes[3].parent_of(nodes[0]))
    self.assertTrue(nodes[8].parent_of(nodes[10]))
import argparse


def parse_args():
    parser = argparse.ArgumentParser('Train a part-of-speech tagger')

    # data
    args = argparse_data(
        parser, tsv=lambda t: TSVReader(word_index=t[0], pos_index=t[1]))
    args.add_argument('--log', type=str, metavar='filepath',
                      help='path to the logging file')

    # lexicon
    args = argparse_lexicon(parser)
    args.add_argument('--a2v', type=str, metavar='filepath',
                      help='path to the ambiguity class bin file')

    # model
    def feature_context(s: str):
        # parse a comma-separated string such as '-2,-1,0,1,2' into ints
        return tuple(map(int, s.split(',')))

    argparse_ffnn(parser)
    args = argparse_model(parser)
    args.add_argument('--feature_context', type=feature_context,
                      metavar='int,int*', default=(-2, -1, 0, 1, 2),
                      help='context window for feature extraction')

    return parser.parse_args()
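# Illustrative only: the converter below reproduces the nested
# feature_context() from parse_args() standalone, to show how a command-line
# value such as `--feature_context -1,0,1` is parsed. The script name in the
# sample invocation is a placeholder, not the repo's actual entry point.
#
#   $ python train_pos.py --feature_context -1,0,1 --log train.log
def feature_context_demo(s: str):
    # same logic as the nested feature_context() in parse_args()
    return tuple(map(int, s.split(',')))

assert feature_context_demo('-1,0,1') == (-1, 0, 1)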
# NOTE: this snippet begins mid-function; the head of group(nodes), which
# initializes `result` and `start` and iterates over the BILOU-tagged nodes,
# is missing here, as are the imports (json, tqdm, TSVReader) and the
# definitions of `data` and `output_dir` used by the __main__ block.
            result.append((word, False, nament))
        if start is not None:
            result.append(
                (graph.word, graph.pos.startswith('PRP'), graph.nament))
            start = None
        elif graph.nament.startswith('L'):
            # an L- tag closes the span opened at `start`
            nament = graph.nament.split("-")[1]
            word = " ".join(g.word for g in graphs.nodes[start:i + 2])
            result.append((word, False, nament))
            start = None
        else:
            continue
    return result


if __name__ == '__main__':
    reader = TSVReader(1, 2, 3, 4, 5, 6, 7, 8)
    for k, thread in enumerate(tqdm(data)):
        for i, email in enumerate(thread['emails']):
            filename = "tsv/{}/{}.tsv".format(thread['path'], i)
            reader.open(filename)
            arr = []
            for nodes in reader.next_all:
                arr.extend(group(nodes))
            email['ents'] = arr
        output_path = "{}/{}.json".format(output_dir, k)
        with open(output_path, 'w') as outfile:
            json.dump(thread, outfile)
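# What follows is a hypothetical reconstruction, NOT the original code: a
# minimal BILOU grouping loop consistent with the surviving tail of
# group(nodes) above. The B- and U- branches and the 0-based indexing are
# assumptions; the original's `start:i + 2` slice suggests its loop index is
# offset by one (e.g., a leading root node), which this sketch does not model.
# It also omits the fragment's pronoun-flag branch, whose trigger is missing.
def group_sketch(graphs):
    result = []   # (text, is_pronoun, label) tuples, matching the tail above
    start = None  # index where the current B-...L- span opened
    for i, graph in enumerate(graphs.nodes):
        if graph.nament.startswith('B'):
            start = i                            # open a multi-token span
        elif graph.nament.startswith('L') and start is not None:
            nament = graph.nament.split('-')[1]  # label after the BILOU prefix
            word = ' '.join(g.word for g in graphs.nodes[start:i + 1])
            result.append((word, False, nament))
            start = None                         # close the span
        elif graph.nament.startswith('U'):
            # single-token entity
            result.append((graph.word, False, graph.nament.split('-')[1]))
    return result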