def check_terminals(code_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code',
                    annot_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.anno'):
    """Check which parse-tree terminal tokens appear verbatim in their annotation.

    Parses the Django code file into parse trees, then walks each tree's
    leaf labels against the corresponding (line-aligned) annotation.
    Terminals absent from the annotation are collected: short ones
    (<= 15 chars) into ``unique_terminals``, longer ones into
    ``invalid_terminals``. Terminals that DO appear are printed.

    Args:
        code_file: path to the aligned code file fed to ``parse_django``.
        annot_file: path to the annotation file, one utterance per line,
            line-aligned with ``code_file``.

    Returns:
        (unique_terminals, invalid_terminals) — two sets of str.
    """
    from parse import parse_django, unescape

    grammar, parse_trees = parse_django(code_file)

    unique_terminals = set()
    invalid_terminals = set()

    # Use a context manager so the annotation file is always closed
    # (the original leaked the handle).
    with open(annot_file) as annot_f:
        for i, line in enumerate(annot_f):
            parse_tree = parse_trees[i]
            utterance = line.strip()
            leaves = parse_tree.get_leaves()
            # tokens = set(nltk.word_tokenize(utterance))
            leave_tokens = [l.label for l in leaves if l.label]

            not_included = []
            for leaf_token in leave_tokens:
                leaf_token = unescape(str(leaf_token))
                if leaf_token not in utterance:
                    not_included.append(leaf_token)
                    # Heuristic cutoff: short missing terminals are kept as
                    # candidate copy targets, long ones treated as invalid.
                    if len(leaf_token) <= 15:
                        unique_terminals.add(leaf_token)
                    else:
                        invalid_terminals.add(leaf_token)
                else:
                    # NOTE(review): isinstance guard kept from the original;
                    # it only matters if unescape() can return a non-str
                    # (e.g. unicode under Python 2) — confirm.
                    if isinstance(leaf_token, str):
                        print(leaf_token)

    # Return the collected sets instead of silently discarding them.
    return unique_terminals, invalid_terminals
def check_terminals(code_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code',
                    annot_file='/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.anno'):
    """Check which parse-tree terminal tokens appear verbatim in their annotation.

    Parses the Django code file into parse trees, then compares each tree's
    leaf labels against the line-aligned annotation utterance. Terminals
    missing from the utterance are bucketed by length: <= 15 chars into
    ``unique_terminals``, longer into ``invalid_terminals``. Terminals that
    are present are printed.

    Args:
        code_file: path to the aligned code file fed to ``parse_django``.
        annot_file: path to the annotation file, one utterance per line.

    Returns:
        (unique_terminals, invalid_terminals) — two sets of str.
    """
    from parse import parse_django, unescape

    grammar, parse_trees = parse_django(code_file)

    unique_terminals = set()
    invalid_terminals = set()

    # Context manager ensures the annotation file is closed
    # (the original leaked the handle).
    with open(annot_file) as annot_f:
        for i, line in enumerate(annot_f):
            parse_tree = parse_trees[i]
            utterance = line.strip()
            leaves = parse_tree.get_leaves()
            # tokens = set(nltk.word_tokenize(utterance))
            leave_tokens = [l.label for l in leaves if l.label]

            not_included = []
            for leaf_token in leave_tokens:
                leaf_token = unescape(str(leaf_token))
                if leaf_token not in utterance:
                    not_included.append(leaf_token)
                    # Heuristic cutoff: short missing terminals are candidate
                    # copy targets; long ones are treated as invalid.
                    if len(leaf_token) <= 15:
                        unique_terminals.add(leaf_token)
                    else:
                        invalid_terminals.add(leaf_token)
                else:
                    # NOTE(review): guard kept from the original; relevant only
                    # if unescape() can return a non-str (e.g. unicode under
                    # Python 2) — confirm.
                    if isinstance(leaf_token, str):
                        print(leaf_token)

    # Return results instead of discarding them.
    return unique_terminals, invalid_terminals
def check_terminals(code_file='/Users/carlosgemmell/Documents/projects/ReCode/data/all.code',
                    annot_file='/Users/carlosgemmell/Documents/projects/ReCode/data/all.anno'):
    """Check which parse-tree terminal tokens appear verbatim in their annotation.

    Parses the code file into parse trees, then compares each tree's leaf
    labels against the line-aligned annotation utterance. Terminals missing
    from the utterance are bucketed by length: <= 15 chars into
    ``unique_terminals``, longer into ``invalid_terminals``. Terminals that
    are present are printed.

    Args:
        code_file: path to the aligned code file fed to ``parse_django``.
        annot_file: path to the annotation file, one utterance per line.

    Returns:
        (unique_terminals, invalid_terminals) — two sets of str.
    """
    from parse import parse_django, unescape

    grammar, parse_trees = parse_django(code_file)

    unique_terminals = set()
    invalid_terminals = set()

    # Context manager ensures the annotation file is closed
    # (the original leaked the handle).
    with open(annot_file) as annot_f:
        for i, line in enumerate(annot_f):
            parse_tree = parse_trees[i]
            utterance = line.strip()
            leaves = parse_tree.get_leaves()
            # tokens = set(nltk.word_tokenize(utterance))
            leave_tokens = [l.label for l in leaves if l.label]

            not_included = []
            for leaf_token in leave_tokens:
                leaf_token = unescape(str(leaf_token))
                if leaf_token not in utterance:
                    not_included.append(leaf_token)
                    # Heuristic cutoff: short missing terminals are candidate
                    # copy targets; long ones are treated as invalid.
                    if len(leaf_token) <= 15:
                        unique_terminals.add(leaf_token)
                    else:
                        invalid_terminals.add(leaf_token)
                else:
                    # NOTE(review): guard kept from the original; relevant only
                    # if unescape() can return a non-str (e.g. unicode under
                    # Python 2) — confirm.
                    if isinstance(leaf_token, str):
                        print(leaf_token)

    # Return results instead of discarding them.
    return unique_terminals, invalid_terminals
def _fill_dataset(dataset, annot_path, parse_trees):
    """Populate *dataset* from an annotation file zipped with parse trees,
    skipping leaf-only trees, then build its data matrices."""
    with open(annot_path) as annot_f:
        for line, parse_tree in zip(annot_f, parse_trees):
            if parse_tree.is_leaf:
                # Leaf-only trees carry no grammar structure; skip them.
                continue
            tokens = tokenize(line.strip())
            dataset.add(DataEntry(tokens, parse_tree))
    dataset.init_data_matrices()
    return dataset


def parse_django_dataset_nt_only():
    """Build the Django train/dev/test DataSets (non-terminal-only variant).

    Reads the aligned annotation/code corpus, builds a vocabulary and
    grammar, splits the parse trees by the fixed corpus boundaries
    (train 0:16000, dev 16000:17000, test 17000:18805), and serializes
    the three DataSets to ``django.typed_rule.bin``.
    """
    from parse import parse_django

    # Single source of truth for the corpus location (was repeated six times).
    prefix = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django'

    annot_file = prefix + '/all.anno'
    vocab = gen_vocab(annot_file, vocab_size=4500)

    code_file = prefix + '/all.code'
    grammar, all_parse_trees = parse_django(code_file)

    # Fixed split boundaries of the Django dataset.
    train_data = _fill_dataset(DataSet(vocab, grammar, name='train'),
                               prefix + '/train.anno',
                               all_parse_trees[0:16000])
    dev_data = _fill_dataset(DataSet(vocab, grammar, name='dev'),
                             prefix + '/dev.anno',
                             all_parse_trees[16000:17000])
    test_data = _fill_dataset(DataSet(vocab, grammar, name='test'),
                              prefix + '/test.anno',
                              all_parse_trees[17000:18805])

    serialize_to_file((train_data, dev_data, test_data),
                      'django.typed_rule.bin')