Example #1
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='seq2tsv')
    # parser.add_argument()
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    outputPathname = args.pathname + ".tsv"
    count = 0
    start = datetime.datetime.now()
    with open(outputPathname, 'w') as f:
        reader = SequenceFile.Reader(args.pathname)

        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        # reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            # print '*' if reader.syncSeen() else ' ',
            print >> f, '%s\t%s' % (key.toString(), value.toString())
            count += 1
            position = reader.getPosition()  # unused; left over from the sync() experiment above

        reader.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED seq2tsv is %s" % elapsed(delta)
    return count
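Every example in this collection leans on helpers from the surrounding module that the excerpts do not show: interpretCmdLine() (which, from its use, returns a (program name, argument list) pair) and elapsed() (which formats a datetime.timedelta for the ELAPSED log lines), plus SequenceFile and Text from a Python Hadoop I/O library. A minimal sketch of the two small helpers, under those assumptions (not the original implementations):

import sys

def interpretCmdLine():
    '''hypothetical reconstruction: split argv into program name and arguments'''
    return (sys.argv[0], sys.argv[1:])

def elapsed(delta):
    '''hypothetical reconstruction: render a datetime.timedelta for log output'''
    return "%d.%06d sec" % (delta.days * 86400 + delta.seconds, delta.microseconds)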
Example #2
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='tsv2seq')
    # parser.add_argument()
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    
    outputPathname = args.pathname + ".seq"
    writer = SequenceFile.createWriter(outputPathname, Text, Text)
    count = 0
    start = datetime.datetime.now()
    with open(args.pathname, 'r') as f:
        # print f
        for line in f:
            try:
                (url, payload) = line.split('\t')
                key = Text()
                key.set(url)
                value = Text()
                # I'm not at all sure why we would want to decode, not encode here
                # this is the only thing that worked
                value.set(Text.decode(json.dumps(payload)))
                writer.append(key, value)
                count += 1
            except ValueError:
                # skip malformed lines (wrong number of tab-separated fields)
                pass
    writer.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED tsv2seq is %s" % elapsed(delta)
    return count
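Note that the bare except above silently skips any line where line.split('\t') does not yield exactly two fields, for example a payload containing a literal tab. If such payloads are legitimate, splitting only on the first tab is more forgiving (a sketch; whether that is the right behavior is an assumption about the data):

# a minimal illustration with a payload that itself contains a tab
line = 'https://example.com/page\t{"a": "x\ty"}\n'
(url, payload) = line.rstrip('\n').split('\t', 1)
assert payload == '{"a": "x\ty"}'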
Example #3
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Test Length')
    # parser.add_argument()
    args = parser.parse_args(args)
    
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    # specific to first url scheme (compiled here but not used in this script)
    urlregex = re.compile(r"""https://karmadigstorage.blob.core.windows.net/arch/([a-zA-Z0-9]+)/(\d{8})/.*\.backpage\.com/(.*)""")
    payload = ""
    count = 0
    total = 0
    url = ""
    for line in sys.stdin:
        # print line
        m = lineregex.match(line) 
        if m:
            url = m.group(1)
            payload = m.group(2)
            increment = len(payload)
            print >> sys.stdout, "%s\t%s" % (url, increment)
            count += 1
            total += increment
    print >> sys.stdout, "%s\ttotal=%s" % (url, total)
    print >> sys.stdout, "%s\tcount=%s" % (url, count)
    print >> sys.stderr, "dig.test.length processed %d records" % count
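All of the stdin filters in this collection assume the same url<TAB>payload record format captured by lineregex. Because the leading (^.+) is greedy, the split effectively happens at the last tab in the line. A minimal, self-contained check of that format (the URL is illustrative only):

import re

lineregex = re.compile(r"""(^.+)\t(.*)""")
m = lineregex.match('https://example.com/page\t{"title": "hi"}')
assert m and m.group(1) == 'https://example.com/page'
assert m.group(2) == '{"title": "hi"}'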
Example #4
def main(argv=None):
    '''this is called if run from command line'''
    start = datetime.datetime.now()
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='azure_publish')
    parser.add_argument('-d',
                        '--directory',
                        help='directory to publish',
                        required=False,
                        action="append",
                        default=[])
    parser.add_argument('-f',
                        '--file',
                        help='file to publish',
                        required=False,
                        action="append",
                        default=[])
    parser.add_argument(
        '-t',
        '--type',
        help='content type',
        required=False,
        choices=["text/html", "image/jpeg", "image/gif", "image/png"],
        default="text/html")
    parser.add_argument('-v',
                        '--verbose',
                        help='print to stderr',
                        required=False,
                        action="store_true",
                        default=VERBOSE)

    args = parser.parse_args(args)
    files = args.file
    directories = args.directory
    verbose = args.verbose
    count = 0
    for pathname in files:
        azure_publish_file(pathname, content_type=args.type)
        count += 1
    for directory in directories:
        for name in os.listdir(directory):
            # os.listdir yields bare names; join with the directory so the right path is published
            azure_publish_file(os.path.join(directory, name), content_type=args.type)
            count += 1
    end = datetime.datetime.now()
    delta = end - start
    if verbose:
        print >> sys.stderr, "ELAPSED azure_publish is %s" % elapsed(delta)
        print >> sys.stderr, "%d files uploaded" % (count)
Example #5
def main(argv=None):
    """this is called if run from command line"""
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description="Token Number Extractor")
    # parser.add_argument()
    args = parser.parse_args(args)

    lineregex = re.compile(r"""(^.+)\t(.*)""")
    rawText = ""
    processed = 0
    url = None
    for line in sys.stdin:
        try:
            # print line
            m = lineregex.match(line)
            if m:
                url = m.group(1)
                rawText = m.group(2)
                post = json.loads(rawText)
                if isinstance(post, dict):
                    titleText = post.get("titleText")
                    if titleText and titleText.get("content"):
                        tzr = Tokenizer(titleText["content"])
                        titleText["tokens"] = [t for t in tzr.genTokens()]
                    else:
                        print >>sys.stderr, "No location text for %r" % url
                    locationText = post.get("locationText")
                    if locationText and locationText.get("content"):
                        tzr = Tokenizer(locationText["content"])
                        locationText["tokens"] = [t for t in tzr.genTokens()]
                    else:
                        print >>sys.stderr, "No title text for %r" % url
                    bodyText = post.get("bodyText")
                    if bodyText and bodyText.get("content"):
                        tzr = Tokenizer(bodyText["content"])
                        bodyText["tokens"] = [t for t in tzr.genTokens()]
                    else:
                        print >>sys.stderr, "No body text for %r" % url
                processed += 1
                js = json.dumps(post, sort_keys=True, indent=None)
                print >>sys.stdout, "%s\t%s" % (url, js)
        except Exception as e:
            print >>sys.stderr, "dig.extract.entity.digtoken.digtoken Exception [%s].  Last url was [%s]" % (
                str(e),
                url,
            )
    print >>sys.stderr, "dig.extract.entity.digtoken.digtoken processed %d records" % processed
Example #6
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Phone Number Extractor')
    # parser.add_argument()
    args = parser.parse_args(args)
    
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    rawText = ""
    for line in sys.stdin:
        # print line
        m = lineregex.match(line) 
        if m:
            url = m.group(1)
            rawText = m.group(2)
            post = json.loads(rawText)
            if isinstance(post, dict):
                allPhoneNumbers = []

                titleText = post.get('titleText')
                if titleText and titleText.get('content'):
                    extr = PhoneExtractor(titleText['content'])
                    phoneNumbers = extr.extractPhoneNumbers()
                    # store locally with the specific text
                    titleText['phoneNumbers'] = phoneNumbers
                    allPhoneNumbers.extend(phoneNumbers)

                locationText = post.get('locationText')
                if locationText and locationText.get('content'):
                    extr = PhoneExtractor(locationText['content'])
                    phoneNumbers = extr.extractPhoneNumbers()
                    # store locally with the specific text
                    locationText['phoneNumbers'] = phoneNumbers
                    allPhoneNumbers.extend(phoneNumbers)

                bodyText = post.get('bodyText')
                if bodyText and bodyText.get('content'):
                    extr = PhoneExtractor(bodyText['content'])
                    phoneNumbers = extr.extractPhoneNumbers()
                    # store locally with the specific text
                    bodyText['phoneNumbers'] = phoneNumbers
                    allPhoneNumbers.extend(phoneNumbers)
            
                post['phoneNumbers'] = uniqueStable(allPhoneNumbers)

            js = json.dumps(post, sort_keys=True, indent=None)
            print >> sys.stdout, "%s\t%s" % (url, js)
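uniqueStable is not defined in these excerpts; from its use above it evidently deduplicates the collected phone numbers while preserving first-seen order. A minimal sketch under that assumption:

def uniqueStable(seq):
    '''return the items of seq with duplicates removed, first occurrence wins'''
    seen = set()
    result = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result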
Example #7
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Backpage Extractor')
    # parser.add_argument()
    args = parser.parse_args(args)

    pageCls = BackpagePage
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    # specific to first url scheme
    urlregex = re.compile(
        r"""https://karmadigstorage.blob.core.windows.net/arch/([a-zA-Z0-9]+)/(\d{8})/.*\.backpage\.com/(.*)"""
    )
    rawText = ""
    processed = 0
    url = None
    for line in sys.stdin:
        try:
            # print line
            m = lineregex.match(line)
            if m:
                url = m.group(1)
                rawText = m.group(2)
                urlMatch = urlregex.match(url)
                if urlMatch:
                    crawlAgent = urlMatch.group(1)
                    datestamp = urlMatch.group(2)
                    tail = urlMatch.group(3)
                    if "index.html" in tail:
                        pass
                    else:
                        pageStr = json.loads(rawText)
                        page = pageCls(url=url,
                                       content=pageStr,
                                       crawlAgent=crawlAgent,
                                       datestamp=int(datestamp))
                        page.process()
                        processed += 1
                        # print "%s\t%s" % (url, len(pageStr))
        except Exception as e:
            print >> sys.stderr, "dig.extract.page.backpage Exception [%s].  Last url was [%s]" % (
                str(e), url)
    print >> sys.stderr, "dig.extract.page.backpage processed %d records" % processed
Example #8
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='db2json')
    # parser.add_argument("-o")
    parser.add_argument('-l', '--limit',
                        help='num rows to fetch',
                        type=int,
                        default=LIMIT)
    parser.add_argument('-o','--output', 
                        help='output file', 
                        default=None)
    parser.add_argument('-q','--query', 
                        help='query or query file', 
                        default=QUERY)
    parser.add_argument('-t','--tab', 
                        help='drop outer brackets, use tab separators', 
                        required=False, 
                        action='store_true')
    parser.add_argument('-v','--verbose', 
                        help='verbose output',
                        required=False, 
                        action='store_true')

    args = parser.parse_args(args)
    try:
        if os.path.exists(args.query):
            with open(args.query, 'r') as qf:
                args.query = qf.read()
    except Exception as e:
        print >> sys.stderr, "Failed to open %s [%r], using it as the query" % (args.query, e)
    if args.verbose:
        print >> sys.stderr, "output %s" % args.output
        print >> sys.stderr, "query %s" % args.query
        print >> sys.stderr, "tab %s" % args.tab
    if args.output:
        with open(args.output, "w") as f:
            db2json(outstream=f, query=args.query, tab=args.tab, limit=args.limit, verbose=args.verbose)
    else:
        db2json(sys.stdout, query=args.query, tab=args.tab, limit=args.limit, verbose=args.verbose)
Example #9
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Token Number Extractor')
    # parser.add_argument()
    args = parser.parse_args(args)
    
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    rawText = ""
    for line in sys.stdin:
        # print line
        m = lineregex.match(line) 
        if m:
            url = m.group(1)
            rawText = m.group(2)
            post = json.loads(rawText)
            if isinstance(post, dict):
                titleText = post.get('titleText')
                if titleText and titleText.get('content'):
                    tzr = Tokenizer(titleText['content'])
                    titleText['tokens'] = [t for t in tzr.genTokens()]
                else:
                    print >> sys.stderr, "No location text for %r" % url
                locationText = post.get('locationText')
                if locationText and locationText.get('content'):
                    tzr = Tokenizer(locationText['content'])
                    locationText['tokens'] = [t for t in tzr.genTokens()]
                else:
                    print >> sys.stderr, "No title text for %r" % url
                bodyText = post.get('bodyText')
                if bodyText and bodyText.get('content'):
                    tzr = Tokenizer(bodyText['content'])
                    bodyText['tokens'] = [t for t in tzr.genTokens()]
                else:
                    print >> sys.stderr, "No body text for %r" % url
            js = json.dumps(post, sort_keys=True, indent=None)
            print >> sys.stdout, "%s\t%s" % (url, js)
Example #10
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='PatternScanner')
    parser.add_argument('-c', '--category', help='category to match',
                        required=True,
                        # note: this default is never used while required=True
                        default=DEFAULTCATEGORY)
    # parser.add_argument()
    args = parser.parse_args(args)
    category = args.category
    
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    rawText = ""
    for line in sys.stdin:
        # print line
        m = lineregex.match(line) 
        if m:
            url = m.group(1)
            rawText = m.group(2)
            post = json.loads(rawText)
            if isinstance(post, dict):
                titleText = post.get('titleText')
                if titleText and titleText.get('tokens'):
                    scanner = PatternScanner(titleText['tokens'], category)
                    # arguably we should only add novel matches
                    # implementation would be to represent/convert
                    # patternScanMatch dicts to namedtuples
                    # use a set to uniquify
                    # then convert back to dict to write out as JSON
                    # for now, we will simply append to list
                    titlePatternScanMatches = titleText.get('patternScanMatches') or []
                    for (phrase, subseqs) in scanner.scan():
                        for subseq in subseqs:
                            resultJson = {"objectType": "patternScanMatch",
                                          "phrasePattern":
                                              {"indic": phrase.indic,
                                               "category": phrase.category,
                                               "family": phrase.family,
                                               "tokenRegexPattern": str(phrase.pattern),
                                               "weight": phrase.weight},
                                          "tokenSequence": subseq}
                            titlePatternScanMatches.append(resultJson)
                    # is it good practice to record empty results
                    # or should we just not have any such entry
                    titleText['patternScanMatches'] = titlePatternScanMatches
                locationText = post.get('locationText')
                if locationText and locationText.get('tokens'):
                    scanner = PatternScanner(locationText['tokens'], category)
                    # arguably we should only add novel matches
                    # implementation would be to represent/convert
                    # patternScanMatch dicts to namedtuples
                    # use a set to uniquify
                    # then convert back to dict to write out as JSON
                    # for now, we will simply append to list
                    locationPatternScanMatches = locationText.get('patternScanMatches') or []
                    for (phrase, subseqs) in scanner.scan():
                        for subseq in subseqs:
                            resultJson = {"objectType": "patternScanMatch",
                                          "phrasePattern":
                                              {"indic": phrase.indic,
                                               "category": phrase.category,
                                               "family": phrase.family,
                                               "tokenRegexPattern": str(phrase.pattern),
                                               "weight": phrase.weight},
                                          "tokenSequence": subseq}
                            locationPatternScanMatches.append(resultJson)
                    # is it good practice to record empty results
                    # or should we just not have any such entry
                    locationText['patternScanMatches'] = locationPatternScanMatches
                bodyText = post.get('bodyText')
                if bodyText and bodyText.get('tokens'):
                    scanner = PatternScanner(bodyText['tokens'], category)
                    # arguably we should only add novel matches
                    # implementation would be to represent/convert
                    # patternScanMatch dicts to namedtuples
                    # use a set to uniquify
                    # then convert back to dict to write out as JSON
                    # for now, we will simply append to list
                    bodyPatternScanMatches = bodyText.get('patternScanMatches') or []
                    for (phrase, subseqs) in scanner.scan():
                        for subseq in subseqs:
                            resultJson = {"objectType": "patternScanMatch",
                                          "phrasePattern":
                                              {"indic": phrase.indic,
                                               "category": phrase.category,
                                               "family": phrase.family,
                                               "tokenRegexPattern": str(phrase.pattern),
                                               "weight": phrase.weight},
                                          "tokenSequence": subseq}
                            bodyPatternScanMatches.append(resultJson)
                    # is it good practice to record empty results
                    # or should we just not have any such entry
                    bodyText['patternScanMatches'] = bodyPatternScanMatches

            js = json.dumps(post, sort_keys=True, indent=None)
            print >> sys.stdout, "%s\t%s" % (url, js)
Example #11
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='PatternScanner')
    parser.add_argument('-c', '--category', help='category(ies) to match', 
                        required=True, 
                        action="append",
                        default=[DEFAULTCATEGORY])
    # parser.add_argument()
    args = parser.parse_args(args)
    categories = args.category
    
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    rawText = ""
    url = None
    processed = 0
    for line in sys.stdin:
        try:
            # print line
            m = lineregex.match(line) 
            if m:
                url = m.group(1)
                rawText = m.group(2)
                post = json.loads(rawText)
                if isinstance(post, dict):
                    for category in categories:
                        titleText = post.get('titleText')
                        if titleText and titleText.get('tokens'):
                            scanner = PatternScanner(titleText['tokens'], category)
                            # arguably we should only add novel matches
                            # implementation would be to represent/convert
                            # patternScanMatch dicts to namedtuples
                            # use a set to uniquify
                            # then convert back to dict to write out as JSON
                            # for now, we will simply append to list
                            titlePatternScanMatches = titleText.get('patternScanMatches') or []
                            for (phrase, subseqs) in scanner.scan():
                                for subseq in subseqs:
                                    resultJson = {"objectType": "patternScanMatch",
                                                  "phrasePattern":
                                                      {"indic": phrase.indic,
                                                       "category": phrase.category,
                                                       "family": phrase.family,
                                                       "tokenRegexPattern": str(phrase.pattern),
                                                       "weight": phrase.weight},
                                                  "tokenSequence": subseq}
                                    titlePatternScanMatches.append(resultJson)
                            # is it good practice to record empty results
                            # or should we just not have any such entry
                            titleText['patternScanMatches'] = titlePatternScanMatches
                        locationText = post.get('locationText')
                        if locationText and locationText.get('tokens'):
                            scanner = PatternScanner(locationText['tokens'], category)
                            # arguably we should only add novel matches
                            # implementation would be to represent/convert
                            # patternScanMatch dicts to namedtuples
                            # use a set to uniquify
                            # then convert back to dict to write out as JSON
                            # for now, we will simply append to list
                            locationPatternScanMatches = locationText.get('patternScanMatches') or []
                            for (phrase, subseqs) in scanner.scan():
                                for subseq in subseqs:
                                    resultJson = {"objectType": "patternScanMatch",
                                                  "phrasePattern":
                                                      {"indic": phrase.indic,
                                                       "category": phrase.category,
                                                       "family": phrase.family,
                                                       "tokenRegexPattern": str(phrase.pattern),
                                                       "weight": phrase.weight},
                                                  "tokenSequence": subseq}
                                    locationPatternScanMatches.append(resultJson)
                            # is it good practice to record empty results
                            # or should we just not have any such entry
                            locationText['patternScanMatches'] = locationPatternScanMatches
                        bodyText = post.get('bodyText')
                        if bodyText and bodyText.get('tokens'):
                            scanner = PatternScanner(bodyText['tokens'], category)
                            # arguably we should only add novel matches
                            # implementation would be to represent/convert
                            # patternScanMatch dicts to namedtuples
                            # use a set to uniquify
                            # then convert back to dict to write out as JSON
                            # for now, we will simply append to list
                            bodyPatternScanMatches = bodyText.get('patternScanMatches') or []
                            for (phrase, subseqs) in scanner.scan():
                                for subseq in subseqs:
                                    resultJson = {"objectType": "patternScanMatch",
                                                  "phrasePattern":
                                                      {"indic": phrase.indic,
                                                       "category": phrase.category,
                                                       "family": phrase.family,
                                                       "tokenRegexPattern": str(phrase.pattern),
                                                       "weight": phrase.weight},
                                                  "tokenSequence": subseq}
                                    bodyPatternScanMatches.append(resultJson)
                            # is it good practice to record empty results
                            # or should we just not have any such entry
                            bodyText['patternScanMatches'] = bodyPatternScanMatches

                # count and emit only lines that matched the url/payload format
                processed += 1
                js = json.dumps(post, sort_keys=True, indent=None)
                print >> sys.stdout, "%s\t%s" % (url, js)

        except Exception as e:
            print >> sys.stderr, "dig.extract.entity.classifier.patscan Exception [%s].  Last url was [%s]" % (str(e), url)
    print >> sys.stderr, "dig.extract.entity.classifier.patscan processed %d records" % processed