# The original header of this excerpt is missing; the imports and the
# test_parser(args) signature below are assumptions, reconstructed from
# the wrapper call at the bottom and the pattern of the other scripts here.
from graphbrain import *
from graphbrain.cli import wrapper
from graphbrain.parsers import *


def test_parser(args):
    # assumption: parser creation mirrors the other scripts in this collection
    parser = create_parser(name=args.lang)
    total = 0
    wrong = 0
    sentence = None
    # the input file alternates sentence lines with their expected edges
    with open(args.infile) as f:
        for line in f:
            if sentence:
                total += 1
                correct_edge = hedge(line.strip())
                parser_output = parser.parse(sentence)
                parsed_sentence = parser_output['parses'][0]
                edge = parsed_sentence['main_edge']
                sent = parsed_sentence['spacy_sentence']
                if edge != correct_edge:
                    wrong += 1
                    print_tree(sent.root)
                    print('expected:')
                    print(correct_edge)
                    print('result:')
                    print(edge)
                sentence = None
            else:
                sentence = line.strip()
    print('%s wrong out of %s.' % (wrong, total))


if __name__ == '__main__':
    wrapper(test_parser, text='parser tests')
import random

from graphbrain.cli import wrapper
from graphbrain.parsers import create_parser


def extract_sentences(args):
    parser = create_parser(name=args.lang)
    sentences = []
    count = 0
    # note: the output file is only opened once, below; the original code
    # also opened it here for writing, which truncated it to no purpose
    with open(args.infile, 'r') as infile:
        for line in infile:
            paragraph = line.strip()
            if len(paragraph) > 0:
                parse_results = parser.parse(paragraph)
                for parse in parse_results['parses']:
                    sentences.append(parse['text'])
                    count += 1
                    if count % 100 == 0:
                        print('{} sentences found'.format(count))
    # write the sentences in random order
    random.shuffle(sentences)
    with open(args.outfile, 'w') as outfile:
        for sentence in sentences:
            outfile.write('{}\n'.format(sentence))


if __name__ == '__main__':
    wrapper(extract_sentences, text='extract sentences')
                    remaining += 1
                    source_name = source[:-1]
                    sources[source_name] += 1
                    sentences_by_source[source_name].append(sentence)
            else:
                empty += 1
                if empty > len(self.input_files):
                    break
        self._close_input_files()

        # write files per source
        for source in sentences_by_source:
            sentences = sentences_by_source[source]
            random.shuffle(sentences)
            with open(join(outdir, '{}.csv'.format(source)), 'w') as f:
                for sentence in sentences:
                    f.write('{}\n'.format(sentence))

        print('existing: {}; remaining: {}'.format(
            len(self.sentences), remaining))
        print(sources)


def extract_remaining_sentences(args):
    SentenceExtractor().extract(args.indir, args.infile, args.outdir)


if __name__ == '__main__':
    wrapper(extract_remaining_sentences,
            text='extract remaining sentences')
            f = self.input_files[name]
            yield f.readline(), name

    def generate(self, indir, outfile):
        self._open_input_files(indir)
        self.load_sentences(outfile)
        # append, so previously annotated cases are preserved
        # (renamed from 'outfile', which shadowed the parameter)
        with open(outfile, 'a') as f_out:
            for sentence, source in self._sentences():
                sentence = sentence.strip()
                if sentence not in self.sentences:
                    self.print_counts()
                    # retry until the sentence is successfully annotated
                    case = None
                    while case is None:
                        case = self.annotate_sentence(sentence, source)
                    f_out.write('{}\n'.format(json.dumps(case)))
                    self.sentences.add(sentence)
                    self.update_counts(case)
        self._close_input_files()


def generate_parser_training_data(args):
    TrainingDataGenerator(args.lang).generate(args.indir, args.outfile)


if __name__ == '__main__':
    wrapper(generate_parser_training_data,
            text='generate parser training data')
            pos_after = row[19]
            source = row[25][:-1]
            X.append((tag, dep, hpos, hdep, pos_after))
            y.append(true_value)
            sources.append(source)

    preds = alpha.predict(X)

    n_source = Counter()
    correct_source = Counter()
    for pred, true_value, source in zip(preds, y, sources):
        n += 1
        n_source[source] += 1
        if pred == true_value:
            correct += 1
            correct_source[source] += 1

    for source in n_source:
        accuracy = float(correct_source[source]) / float(n_source[source])
        print('{} accuracy: {} [{} correct out of {}]'.format(
            source, accuracy, correct_source[source], n_source[source]))

    print()
    accuracy = float(correct) / float(n)
    print('overall accuracy: {} [{} correct out of {}]'.format(
        accuracy, correct, n))


if __name__ == '__main__':
    wrapper(test_alpha, text='test alpha classifier')
    def run(self):
        new_features = ALL_FEATURES
        cur_features = None
        i = 1

        # ablation stage
        while new_features != cur_features:
            self._log('\n>>> ITERATION {} <<<'.format(i))
            i += 1
            cur_features = new_features
            new_features = self._ablate(cur_features)

        # regrowth stage
        cur_features = None
        while new_features != cur_features:
            self._log('\n>>> ITERATION {} <<<'.format(i))
            i += 1
            cur_features = new_features
            new_features = self._regrow(cur_features)


def select_alpha_features(args):
    infile = args.infile
    outfile = args.outfile
    FeatureSelector(infile, outfile).run()


if __name__ == '__main__':
    wrapper(select_alpha_features,
            text='select features for alpha classifier')
        delta_t = time.time() - start_t
        self.time_acc += delta_t
        items_per_min = float(self.items_processed) / float(self.time_acc)
        items_per_min *= 60.
        # print('total items: %s' % self.items_processed)
        # print('items per minute: %s' % items_per_min)
        self.items_processed += 1

    def parse_file(self, filename):
        lines = file_lines(filename)
        i = 0
        with progressbar.ProgressBar(max_value=lines) as bar:
            with open(filename, 'r') as f:
                for line in f:
                    post = json.loads(line)
                    self.parse_post(post)
                    i += 1
                    bar.update(i)

        print('main edges created: %s' % self.main_edges)
        print('extra edges created: %s' % self.extra_edges)


def _parse(args):
    hgraph = hypergraph(args.hg)
    RedditParser(hgraph).parse_file(args.infile)


if __name__ == '__main__':
    wrapper(_parse, text='reddit parser')
                    word_after = str(spacy_sentence[i + 1])
                    pos_after = spacy_sentence[i + 1].pos_
                    dep_after = spacy_sentence[i + 1].dep_
                    if spacy_sentence[i + 1].pos_ == 'PUNCT':
                        punct_after = True

                head = token.head
                # in spaCy the root token is its own head (head is never
                # None), so the original 'head is None' test could never be
                # true; test for identity instead, and blank the head
                # columns for roots, as the original evidently intended
                is_root = head == token
                has_lefts = token.n_lefts > 0
                has_rights = token.n_rights > 0
                outfile.write(('{}' + '\t{}' * 23 + '\n').format(
                    hedge(atom).type()[0],
                    str(token),
                    token.pos_,
                    token.tag_,
                    token.dep_,
                    str(head) if not is_root else '',
                    head.pos_ if not is_root else '',
                    head.tag_ if not is_root else '',
                    head.dep_ if not is_root else '',
                    is_root,
                    has_lefts,
                    has_rights,
                    token.ent_type_,
                    token.shape_[:2],
                    word_before,
                    word_after,
                    punct_before,
                    punct_after,
                    pos_before,
                    pos_after,
                    dep_before,
                    dep_after,
                    case['correct'],
                    case['source']))
        else:
            failed_parses += 1

    print('sentences: {}; ignored: {}; failed: {}; atoms: {}'.format(
        total_sentences, ignored_sentences, failed_parses, total_atoms))
    print('done.')


if __name__ == '__main__':
    wrapper(generate_alpha_training_data,
            text='generate alpha training data')
        print(colored(str(he), 'white'))

        sentence = line.strip()
        if sentence not in sentences:
            sentences.append(sentence)
            parser_output = parser.parse(sentence)
            parsed_sentence = parser_output['parses'][0]
            edge = parsed_sentence['main_edge']
            if edge:
                print('\n{}\n{}\n'.format(sentence, indented(edge)))
                answer = he.input()
                if answer == 'd':
                    defects = input_defects(sentence, edge)
                else:
                    defects = []
                he.apply_evaluation(answer, edge, defects)
                defect_str = '&'.join(
                    [defect.to_str() for defect in defects])
                row_str = '\t'.join(
                    (sentence, edge.to_str(), answer, defect_str))
                with open(args.outfile, 'a') as of:
                    of.write('{}\n'.format(row_str))


if __name__ == '__main__':
    wrapper(manual_test, text='manual test of parser')
            cases.append(case)

    random.shuffle(cases)

    # hold out one third of the cases for testing
    n_cases = len(cases)
    n_test = int(n_cases / 3)
    train_cases = cases[n_test:]
    test_cases = cases[:n_test]

    # derive output names from the input name, minus the '.json' extension
    file_main_name = infile[:-5]
    train_file_name = '{}-train.json'.format(file_main_name)
    test_file_name = '{}-test.json'.format(file_main_name)

    with open(train_file_name, 'wt') as f:
        for case in train_cases:
            f.write('{}\n'.format(json.dumps(case)))

    with open(test_file_name, 'wt') as f:
        for case in test_cases:
            f.write('{}\n'.format(json.dumps(case)))

    print('{} total cases found'.format(n_cases))
    print('wrote {} train cases to {}'.format(
        len(train_cases), train_file_name))
    print('wrote {} test cases to {}'.format(
        len(test_cases), test_file_name))


if __name__ == '__main__':
    wrapper(split_parser_training_data,
            text='split parser training data')
from graphbrain import *
from graphbrain.cli import wrapper
from graphbrain.parsers import *


def update_tests(args):
    parser = create_parser(name=args.lang, resolve_corefs=False)
    total = 0
    sentence = None
    with open(args.infile) as f_in:
        with open(args.outfile, 'w') as f_out:
            for line in f_in:
                if sentence:
                    total += 1
                    parser_output = parser.parse(sentence)
                    parsed_sentence = parser_output['parses'][0]
                    edge = parsed_sentence['main_edge']
                    f_out.write('{}\n{}\n'.format(sentence, edge.to_str()))
                    sentence = None
                else:
                    sentence = line.strip()
    print('Total cases processed: {}.'.format(total))


if __name__ == '__main__':
    wrapper(update_tests, text='update tests')