def main(args):
    """Classify each annotation in the input using a pickled classifier.

    Reads lines from ``argp.input``, wraps them into Document objects,
    optionally primes the simstring cache, loads the pickled classifier
    from ``argp.model_path`` and prints one
    ``<annotation text>\\t<ranked classification>`` line per annotation.

    Args:
        args: full argv-style argument list; ``args[1:]`` is parsed by
            ARGPARSER.
    """
    argp = ARGPARSER.parse_args(args[1:])

    if not argp.no_cache:
        # Caching needs every document up-front, so we can't process
        # stdin iteratively -- read it all into a single document.
        doc = Document('<classify>', [], [], '<classify>')
        for _string in (l.rstrip('\n') for l in argp.input):
            doc.abstract.append(_string_to_ann_sent(_string))
        docs = (doc, )
    else:
        # Streaming path: lazily wrap each input line in its own Document.
        docs = (Document('Line: %s' % i, [], [_string_to_ann_sent(_string)],
                '<stdin>')
                for i, _string in enumerate(
                    (l.rstrip('\n') for l in argp.input), start=1))

    # Cache the strings for speed
    if not argp.no_cache:
        cache_simstring((docs, ), verbose=argp.verbose)

    # BUG FIX: pickle data must be read in binary mode ('rb', was 'r');
    # text mode can corrupt the stream on platforms that translate
    # newlines.
    with open(argp.model_path, 'rb') as model_file:
        classifier = pickle_load(model_file)

    # TODO: Faster to do it in a batch instead
    for doc in docs:
        for sent in doc:
            for ann in sent:
                # Parenthesized single-argument print: identical output
                # under the Python 2 print statement, forward-compatible.
                print('%s\t%s' % (sent.annotation_text(ann),
                        str(classifier.classify(doc, sent, ann,
                            ranked=True))))
def main(args):
    """Train a SimStringInternalClassifier and pickle it to disk.

    Builds a training Document from the tab-separated input, primes the
    simstring cache, trains the classifier on the document and
    serialises the trained classifier to ``argp.model_path``.

    Args:
        args: full argv-style argument list; ``args[1:]`` is parsed by
            ARGPARSER.
    """
    argp = ARGPARSER.parse_args(args[1:])

    # Create a dataset out of the input
    doc = _tab_separated_input_to_doc(argp.input)

    # Cache the strings for speed
    cache_simstring(((doc, ), ), verbose=argp.verbose)

    classifier = SimStringInternalClassifier()
    classifier.train((doc, ))

    # BUG FIX: pickle data must be written in binary mode ('wb', was
    # 'w'); text mode can corrupt the stream on platforms that
    # translate newlines, and it must match the binary read on load.
    with open(argp.model_path, 'wb') as model_file:
        pickle_dump(classifier, model_file)