import json
import logging
import os
import sys

# Project-local helpers used by the snippets below (sentence_iterator,
# get_dependencies, get_cfg, TextTo4lang, deps_to_sen_dict, dict_to_graph,
# HEADER) are assumed to come from the surrounding 4lang codebase; their
# import paths are not shown here.


def main():
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : "
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    # field indices for the CoNLL-style input (lemma and msd are unused here)
    id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
        0, 1, None, None, -4, -3)
    with open(sys.argv[1]) as stream:
        c = 0
        for sentence in sentence_iterator(stream, comment_tag='#'):
            try:
                deps = get_dependencies(
                    sentence, id_field, word_field, lemma_field, msd_field,
                    gov_field, dep_field)
            except Exception:
                # dump the offending sentence before bailing out; a bare
                # except would also swallow KeyboardInterrupt and SystemExit
                print(sentence)
                sys.exit(-1)
            print(deps)
            c += 1
            if c % 1000 == 0:
                print(c)
        print(c)

def main():
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : "
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    cfg_file = sys.argv[1]
    cfg = get_cfg(cfg_file)
    text_to_4lang = TextTo4lang(cfg)
    fn = cfg.get('text', 'input_sens')
    base_fn = os.path.basename(fn)
    deps_fn = os.path.join(text_to_4lang.deps_dir, "{0}.deps".format(base_fn))
    # field indices differ between the Hungarian and the default CoNLL layout
    if text_to_4lang.lang == 'hu':
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, 3, 4, -4, -2)
    else:
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, None, None, -4, -3)
    # materialize the result as a list: under Python 3, map() is lazy, so a
    # map object would be consumed once and is not JSON-serializable below
    with open(fn) as in_f:
        deps = [
            get_dependencies(s, id_field, word_field, lemma_field, msd_field,
                             gov_field, dep_field)
            for s in sentence_iterator(in_f)]
    if text_to_4lang.lang == 'en':
        # convert to old deps (for now, see issue #51)
        c_deps = []
        for sen in deps:
            c_deps.append([])
            for d in sen:
                c_deps[-1].append((
                    d['type'],
                    (d['gov']['word'], d['gov']['id']),
                    (d['dep']['word'], d['dep']['id'])))
    else:
        c_deps = deps
    with open(deps_fn, 'w') as out_f:
        out_f.write("{0}\n".format(json.dumps({
            "deps": c_deps, "corefs": []})))
    text_to_4lang.process_deps(deps_fn)

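# A minimal sketch of reading such a .deps file back in; load_deps is a
# hypothetical helper (not part of the codebase above), assuming the
# one-JSON-object-per-file format written by main():
def load_deps(deps_fn):
    with open(deps_fn) as in_f:
        data = json.loads(in_f.read())
    return data["deps"], data["corefs"]
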
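# SEEN and GRAPH_STRING are module-level accumulators, reset per sentence in
# main() below and presumably filled in by dict_to_graph (not shown); a
# minimal declaration, assuming nothing else in the original module
# initializes them, would be:
SEEN = {}
GRAPH_STRING = ''
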
def main():
    print(HEADER)
    id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
        0, 1, None, None, -4, -3)
    global SEEN
    global GRAPH_STRING
    with open(sys.argv[1]) as stream:
        for sentence in sentence_iterator(stream, comment_tag='#'):
            deps = get_dependencies(
                sentence, id_field, word_field, lemma_field, msd_field,
                gov_field, dep_field)
            sentence_dict, root_token = deps_to_sen_dict(deps)
            # fall back to the first token if there is no ROOT
            # (dict.keys() is not subscriptable under Python 3)
            if root_token is None:
                root_token = next(iter(sentence_dict))
            # reset the per-sentence accumulators before building the graph
            SEEN = {}
            GRAPH_STRING = ''
            dict_to_graph(sentence_dict, root_token)
            print(GRAPH_STRING)

def conll_to_deps(stream):
    # yield one dependency list per sentence in the CoNLL stream
    for sen in sentence_iterator(stream):
        yield get_dependencies(sen)

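# A minimal, illustrative driver for conll_to_deps, assuming a CoNLL file
# path on the command line (not part of the original scripts):
if __name__ == "__main__":
    with open(sys.argv[1]) as stream:
        for deps in conll_to_deps(stream):
            print(deps)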