Exemplo n.º 1
0
def main(argv=None):
    '''Command-line entry point: scan an input file and print the result as JSON.

    Parameters:
        argv: optional list of argument strings, without the program name.
            When None, argparse falls back to sys.argv[1:].

    The positional ``input`` argument names a file that is read as UTF-8.
    With ``--type html`` the content is first passed through
    pymod.htmlextract.extract_text.  The text is tokenized with Tokenizer,
    the tokens are handed to patternScan together with the optional
    category/family/indicator filters, and whatever patternScan returns is
    written to stdout as JSON with an indent of 4.

    Note: ``--verbose`` is accepted but not consulted by this function.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument('-c', '--category', required=False,
                        help='major category of rules to apply (default: use all)')
    parser.add_argument('-f', '--family', required=False,
                        help='minor category of rules to apply (default: use all)')
    parser.add_argument('-i', '--indicator', required=False,
                        help='Indicate precise rule as X.Y.Z')
    parser.add_argument('-t', '--type', required=False, default='text',
                        help='input file type', choices=('text', 'html'))
    parser.add_argument('-v', '--verbose', required=False, help='verbose',
                        action='store_true')
    # Honour the caller-supplied argument list; None keeps the previous
    # behaviour of parsing sys.argv[1:].
    args = parser.parse_args(argv)

    with codecs.open(args.input, 'r', encoding='utf-8') as f:
        text = f.read()

    # The file has been read completely, so extraction can happen after the
    # handle is closed.
    if args.type == 'html':
        from pymod.htmlextract import extract_text
        text = extract_text(text)

    tok = Tokenizer(text)
    tokens = list(tok.genTokens())
    result = patternScan(tokens, category=args.category,
                         family=args.family, indicator=args.indicator)
    # Writes the same output as the Python 2 ``print >> sys.stdout`` statement
    # (text followed by a newline) but also runs under Python 3.
    sys.stdout.write(json.dumps(result, indent=4) + '\n')
Exemplo n.º 2
0
def main(argv=None):
    '''Run the pattern scanner from the command line and print JSON to stdout.

    Parameters:
        argv: optional list of argument strings, without the program name.
            If None, argparse parses sys.argv[1:] instead.

    Steps: parse the options, read the ``input`` file as UTF-8, optionally
    run it through pymod.htmlextract.extract_text when ``--type html`` is
    given, tokenize the text with Tokenizer, call patternScan with the
    tokens and the category/family/indicator filters, and write the returned
    value to stdout as JSON indented by 4.

    Note: the ``--verbose`` flag is parsed but not used by this function.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument(
        '-c',
        '--category',
        required=False,
        help='major category of rules to apply (default: use all)')
    parser.add_argument(
        '-f',
        '--family',
        required=False,
        help='minor category of rules to apply (default: use all)')
    parser.add_argument('-i',
                        '--indicator',
                        required=False,
                        help='Indicate precise rule as X.Y.Z')
    parser.add_argument('-t',
                        '--type',
                        required=False,
                        default='text',
                        help='input file type',
                        choices=('text', 'html'))
    parser.add_argument('-v',
                        '--verbose',
                        required=False,
                        help='verbose',
                        action='store_true')
    # Pass argv through so programmatic callers are respected; argparse
    # treats None as "use sys.argv[1:]", which was the old behaviour.
    args = parser.parse_args(argv)

    with codecs.open(args.input, 'r', encoding='utf-8') as f:
        text = f.read()

    # Nothing below needs the open handle, so extraction runs after close.
    if args.type == 'html':
        from pymod.htmlextract import extract_text
        text = extract_text(text)

    tok = Tokenizer(text)
    tokens = list(tok.genTokens())
    result = patternScan(tokens,
                         category=args.category,
                         family=args.family,
                         indicator=args.indicator)
    # Same output as the Python 2 ``print >> sys.stdout`` statement (text plus
    # trailing newline), written in a form that is also valid on Python 3.
    sys.stdout.write(json.dumps(result, indent=4) + '\n')
Exemplo n.º 3
0
    def applyClassifier(self, input):
        '''Classify ``input`` and report the probability of the "pos" label.

        Parameters:
            input: the text to classify, or the string '-' to read the text
                from standard input instead.

        Returns:
            A dict with three keys: "input" (the text as received or as read
            from stdin, before any HTML extraction), "class"
            (self.positiveClass) and "prob" (the value of
            prob_dist.prob("pos") from self.classifier.prob_classify).

        When self.inputType is 'html', the text is passed through
        pymod.htmlextract.extract_text before classification.  The
        classification runs inside the ``timeblock`` context manager, which
        receives a descriptive label and self.verbose.
        '''
        label = self.positiveClass
        rule = self.indicator

        # A lone dash means "take the document from standard input".
        if input == '-':
            input = sys.stdin.read()

        if self.inputType == 'html':
            from pymod.htmlextract import extract_text
            document = extract_text(input)
        else:
            document = input

        banner = "applying %s %s classifier" % (label, rule)
        with timeblock(banner, self.verbose):
            distribution = self.classifier.prob_classify(document)
            return {
                "input": input,
                "class": self.positiveClass,
                "prob": distribution.prob("pos"),
            }