Example #1
File: utils.py Project: ml-lab/AdvCodec
from nltk.parse.corenlp import CoreNLPDependencyParser


def parse_by_token(tokens, splited_tokens):
    """
    :param tokens: list(list(str)) : a paragraph of sentences
    :return: triples, words, masks, failure
    """
    parser = CoreNLPDependencyParser(url='http://localhost:9000')
    parses = []

    for sent in splited_tokens:
        # Parse one sentence at a time, keeping tokens verbatim
        # (no PTB escaping, no fraction normalization).
        parses.append(parser.parse_sents([sent], properties={
            'tokenize.options': 'ptb3Escaping=false, normalizeFractions=false'}))

    parse_graphs = []
    try:
        for parse in parses:
            parse_graph = []
            parse_sents = list(parse)
            for i in parse_sents:
                for j in i:
                    if j is not None:
                        parse_graph.append(list(j.nodes.values()))
            # If the parser split one input sentence into several parses,
            # discard the result so alignment with splited_tokens is preserved.
            if len(parse_graph) > 1:
                parse_graph = [None]
            parse_graphs += parse_graph
        if len(parse_graphs) != len(splited_tokens):
            raise Exception(
                "parsed {} sentences, expected {} sentences".format(len(parse_graphs), len(splited_tokens)))
    except Exception as e:
        print(e)
        # On any parsing error, treat the whole paragraph as one failed parse.
        return [None], tokens, [False], 1

    triples = []
    tokenized_p = []
    masks = []
    failure = 0

    for i, parse_graph in enumerate(parse_graphs):
        if parse_graph is not None:
            parse_values = []
            for k in parse_graph:
                if k is not None:
                    parse_values.append(k)
                else:
                    print("NONE happened", tokens)
            parse_values.sort(key=lambda x: x["address"])
            # Drop the artificial ROOT node at address 0.
            parse_values = parse_values[1:]
            triple, tokens, mask = parse_sentence(parse_values, splited_tokens[i])
            if triple is None:
                failure += 1
            triples.append(triple)
            tokenized_p.append(tokens)
            masks.append(mask)
        else:
            triples.append(None)
            tokenized_p.append(splited_tokens[i])
            masks.append(False)
            failure += 1

    return triples, tokenized_p, masks, failure
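
Both examples assume a Stanford CoreNLP server listening on http://localhost:9000, e.g. started from the CoreNLP distribution directory with:

java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000

A minimal, hypothetical driver for Example #1 (the input sentences are made up, passing the same list for both arguments is only one plausible call pattern, and parse_sentence must be available from the rest of utils.py):

# Hypothetical usage sketch, not part of the project.
tokens = [["The", "cat", "sat", "."], ["It", "purred", "."]]
triples, words, masks, failure = parse_by_token(tokens, tokens)
print(failure, masks)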
Example #2
import logging
import os

from nltk.parse.corenlp import CoreNLPDependencyParser


def generate_individual_conllu(input_dir, output_dir):
    """Dependency-parse every file in input_dir and write one CoNLL-U file per input file to output_dir."""

    def _chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

    already_generated = os.listdir(output_dir)

    for file in sorted(os.listdir(input_dir)):

        output_file_name = '%s.conllu' % file
        if output_file_name not in already_generated:  # don't parse again files already parsed!

            input_file = '%s/%s' % (input_dir, file)
            output_file = '%s/%s' % (output_dir, output_file_name)

            with open(input_file, "r", encoding='utf-8') as i_f, open(output_file, "w", encoding='utf-8') as o_f1:
                nl = 1
                lines = i_f.readlines()
                for chunk in _chunks(lines, 500):
                    # Send at most 500 pre-tokenized sentences per request
                    # to keep the payload to the CoreNLP server small.
                    sentences = [a.split() for a in chunk]
                    sentences_parses = dep_parser.parse_sents(sentences)
                    for sentence_parse in sentences_parses:
                        for parse in sentence_parse:
                            conll = parse.to_conll(style=10)

                            # write to output_file
                            o_f1.write("# sent_id = %s\n" % nl)
                            o_f1.write("%s\n" % conll)
                            nl += 1

                            if nl % 1000 == 0:
                                logging.info("file: %s; sent_id = %s" % (output_file, nl))

        else:
            logging.info('skipping %s; already parsed!' % output_file_name)
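
A minimal, hypothetical invocation of Example #2. The directory names are made up; the function expects input_dir to contain plain-text files with one whitespace-tokenized sentence per line (which is what the a.split() call above assumes), and the same CoreNLP server must be running:

# Hypothetical usage sketch, not part of the project.
logging.basicConfig(level=logging.INFO)
generate_individual_conllu('data/raw', 'data/conllu')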