Пример #1
0
def read_graphs(reader: TSVReader, filename: str) -> List[NLPGraph]:
    """Read every NLP graph contained in *filename*.

    :param reader: the TSV reader used to parse the file.
    :param filename: path to the TSV file to read.
    :return: the list of all graphs parsed from the file.
    """
    # Lazy %-args instead of eager string concatenation/formatting,
    # so the message is only built if the log level is enabled.
    logging.info('Reading: %s', os.path.basename(filename))
    reader.open(filename)
    try:
        # `next_all` is accessed as a property here, consistent with
        # its other uses in this file.
        graphs = reader.next_all
    finally:
        # Close the reader even if parsing raises, so the underlying
        # file handle is never leaked.
        reader.close()
    logging.info('- %s graphs', len(graphs))
    return graphs
Пример #2
0
    def test_dependencyLabel(self):
        """Dependency labels resolve child->parent; the reverse lookup is None."""
        reader = TSVReader('../../../resources/sample.tsv',
                           1, 2, 3, 4, 5, 6, 7, 8)
        graph = reader.next()
        token = list(graph.nodes)

        self.assertEqual(token[5].get_dependency_label(token[3]), 'advcl')
        self.assertEqual(token[3].get_dependency_label(token[5]), None)
        self.assertEqual(token[14].get_dependency_label(token[10]), 'ppmod')
Пример #3
0
    def test_nlpnode_set(self):
        """Setting a parent via set_parent updates the node's `parent` attribute."""
        filename = '../../../resources/sample.tsv'
        reader = TSVReader(filename, 1, 2, 3, 4, 5, 6, 7, 8)
        graph = reader.next()
        # Materialize the node iterator so nodes can be indexed.
        container = list(graph.nodes)

        # Dropped the original's second temporary node: it was constructed
        # but never used by any assertion.
        tempnode1 = NLPNode("TEMP_1")
        container[0].set_parent(tempnode1, 'dep_1')
        self.assertEqual(container[0].parent, tempnode1)
Пример #4
0
    def test_getChild(self):
        """Leftmost/rightmost child lookups on the sample graph."""
        reader = TSVReader('../../../resources/sample.tsv',
                           1, 2, 3, 4, 5, 6, 7, 8)
        graph = reader.next()
        node_list = [n for n in graph.nodes]

        # (node, expected leftmost child) pairs.
        for idx, expected_idx in ((14, 11), (19, 16), (3, 1)):
            self.assertEqual(node_list[idx].get_leftmost_child(),
                             node_list[expected_idx])
        self.assertEqual(node_list[3].get_rightmost_child(), node_list[20])
Пример #5
0
    def test_nlpnode_ancestor(self):
        """Parent/grandparent navigation and parent_of checks on the sample graph."""
        reader = TSVReader('../../../resources/sample.tsv',
                           1, 2, 3, 4, 5, 6, 7, 8)
        graph = reader.next()
        node_list = list(graph.nodes)

        root = node_list[0]
        # The root has no ancestors at all.
        self.assertEqual(root.parent, None)
        self.assertEqual(root.grandparent, None)

        self.assertEqual(node_list[3].parent, root)
        self.assertEqual(node_list[8].parent, node_list[10])
        self.assertEqual(node_list[8].grandparent, node_list[3])

        self.assertTrue(node_list[3].parent_of(root))
        self.assertTrue(node_list[8].parent_of(node_list[10]))
Пример #6
0
def parse_args():
    """Build and parse the command line for training a part-of-speech tagger.

    :return: the parsed ``argparse.Namespace`` with data, lexicon, and
        model options.
    """
    parser = argparse.ArgumentParser('Train a part-of-speech tagger')

    # data
    args = argparse_data(
        parser, tsv=lambda t: TSVReader(word_index=t[0], pos_index=t[1]))
    args.add_argument('--log',
                      type=str,
                      metavar='filepath',
                      help='path to the logging file')

    # lexicon
    args = argparse_lexicon(parser)
    args.add_argument('--a2v',
                      type=str,
                      metavar='filepath',
                      help='path to the ambiguity class bin file')

    # model
    def feature_context(s: str):
        """Parse a comma-separated int list (e.g. '-2,-1,0,1,2') into a tuple."""
        return tuple(map(int, s.split(',')))

    argparse_ffnn(parser)
    args = argparse_model(parser)
    args.add_argument('--feature_context',
                      type=feature_context,
                      metavar='int,int*',
                      # A tuple, not a list: the default must have the same
                      # type as a value produced by `feature_context` when
                      # the option is given on the command line.
                      default=(-2, -1, 0, 1, 2),
                      help='context window for feature extraction')

    return parser.parse_args()
Пример #7
0
            result.append((word, False, nament))
            if start is not None:
                result.append(
                    (graph.word, graph.pos.startswith('PRP'), graph.nament))
                start = None
        elif graph.nament.startswith('L'):
            nament = graph.nament.split("-")[1]
            word = " ".join([g.word for g in graphs.nodes[start:i + 2]])
            result.append((word, False, nament))
            start = None
        else:
            continue
    return result


if __name__ == '__main__':
    # NOTE(review): `data`, `group`, and `outout_dir` are assumed to be
    # defined earlier in this file — confirm against the full source.
    reader = TSVReader(1, 2, 3, 4, 5, 6, 7, 8)

    for thread_idx, thread in enumerate(tqdm(data)):
        for email_idx, email in enumerate(thread['emails']):
            filename = "tsv/{}/{}.tsv".format(thread['path'], email_idx)

            reader.open(filename)
            # Flatten the grouped entities of every graph in this email's file.
            email['ents'] = [
                ent for nodes in reader.next_all for ent in group(nodes)
            ]

        outout_path = "{}/{}.json".format(outout_dir, thread_idx)
        with open(outout_path, 'w') as outfile:
            json.dump(thread, outfile)