import logging
import os

from nltk.parse.corenlp import CoreNLPDependencyParser


def parse_by_token(tokens, splited_tokens):
    """
    :param tokens: list(list(str)): a paragraph of sentences
    :param splited_tokens: list(list(str)): the same paragraph, one token list per sentence
    :return: triples, words, masks, failure
    """
    parser = CoreNLPDependencyParser(url='http://localhost:9000')
    parses = []
    for sent in splited_tokens:
        parses.append(parser.parse_sents(
            [sent],
            properties={'tokenize.options': 'ptb3Escaping=false, normalizeFractions=false'}))

    parse_graphs = []
    try:
        for parse in parses:
            parse_graph = []
            parse_sents = list(parse)
            for i in parse_sents:
                for j in i:
                    if j is not None:
                        parse_graph.append(list(j.nodes.values()))
            # CoreNLP may re-split a single input sentence into several parses;
            # collapse that case into a single None placeholder.
            if len(parse_graph) > 1:
                parse_graph = [None]
            parse_graphs += parse_graph
        if len(parse_graphs) != len(splited_tokens):
            raise Exception("parsed {} sentences, which does not match the original {} sentences".format(
                len(parse_graphs), len(splited_tokens)))
    except Exception as e:
        print(e)
        return [None], tokens, [False], 1

    triples = []
    tokenized_p = []
    masks = []
    failure = 0
    for i, parse_graph in enumerate(parse_graphs):
        if parse_graph is not None:
            parse_values = []
            for k in parse_graph:
                if k is not None:
                    parse_values.append(k)
                else:
                    print("NONE happened", tokens)
            # sort dependency nodes by token position and drop the artificial ROOT node
            parse_values.sort(key=lambda x: x["address"])
            parse_values = parse_values[1:]
            # parse_sentence (defined elsewhere in this module) returns (triple, tokens, mask);
            # `sent_tokens` avoids shadowing the `tokens` parameter
            triple, sent_tokens, mask = parse_sentence(parse_values, splited_tokens[i])
            if triple is None:
                failure += 1
            triples.append(triple)
            tokenized_p.append(sent_tokens)
            masks.append(mask)
        else:
            triples.append(None)
            tokenized_p.append(splited_tokens[i])
            masks.append(False)
            failure += 1
    return triples, tokenized_p, masks, failure
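# Usage sketch for parse_by_token (illustrative, not part of the original code):
# it assumes a CoreNLP server is already running on localhost:9000 and that
# parse_sentence is available in this module; the paragraph is made-up data.
def _demo_parse_by_token():
    paragraph = [["The", "cat", "sat", "."], ["It", "purred", "."]]
    triples, words, masks, failure = parse_by_token(paragraph, paragraph)
    print("sentences that failed to parse:", failure)
    for triple, mask in zip(triples, masks):
        if mask:
            print(triple)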
def generate_individual_conllu(input_dir, output_dir):
    """Parse every file in input_dir with CoreNLP and write one CoNLL-U file
    per input file to output_dir, skipping files that were already parsed."""

    def _chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    already_generated = os.listdir(output_dir)
    for file in sorted(os.listdir(input_dir)):
        output_file_name = '%s.conllu' % file
        if output_file_name not in already_generated:  # don't parse again files already parsed!
            input_file = '%s/%s' % (input_dir, file)
            output_file = '%s/%s' % (output_dir, output_file_name)
            with open(input_file, "r", encoding='utf-8') as i_f, \
                    open(output_file, "w", encoding='utf-8') as o_f1:
                nl = 1
                lines = i_f.readlines()
                # parse in chunks of 500 sentences to keep each server request small
                for chunk in _chunks(lines, 500):
                    sentences = [a.split() for a in chunk]
                    sentences_parses = dep_parser.parse_sents(sentences)
                    for sentence_parse in sentences_parses:
                        for parse in sentence_parse:
                            conll = parse.to_conll(style=10)  # 10-column CoNLL-U style
                            o_f1.write("# sent_id = %s\n" % nl)
                            o_f1.write("%s\n" % conll)
                            nl += 1
                            if nl % 1000 == 0:
                                logging.info("file: %s; sent_id = %s" % (output_file, nl))
        else:
            logging.info('skipping %s; already parsed!' % output_file_name)
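if __name__ == "__main__":
    # Usage sketch (illustrative, not part of the original code): assumes a
    # CoreNLP server is running on localhost:9000, started e.g. with
    #   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
    # The directory names are placeholders; input files are expected to hold
    # one whitespace-tokenized sentence per line.
    logging.basicConfig(level=logging.INFO)
    generate_individual_conllu("raw_sentences", "conllu_output")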