def main(argv=None):
    """this is called if run from command line

    Convert a Hadoop SequenceFile <pathname> into a TSV file
    "<pathname>.tsv", one "key\\tvalue" line per record.
    Returns the number of records written.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='seq2tsv')
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    outputPathname = args.pathname + ".tsv"
    count = 0
    start = datetime.datetime.now()
    with open(outputPathname, 'w') as f:
        reader = SequenceFile.Reader(args.pathname)
        try:
            key_class = reader.getKeyClass()
            value_class = reader.getValueClass()
            key = key_class()
            value = value_class()
            while reader.next(key, value):
                print >> f, '%s\t%s' % (key.toString(), value.toString())
                # BUG FIX: count was never incremented, so 0 was always
                # returned; it now reflects the number of records written
                count += 1
        finally:
            # close the reader even if a record fails to convert
            reader.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED seq2tsv is %s" % elapsed(delta)
    return count
def main(argv=None):
    """this is called if run from command line

    Convert TSV file <pathname> ("url\\tpayload" lines) into a Hadoop
    SequenceFile "<pathname>.seq" with Text keys and Text values.
    Lines without exactly one tab are skipped.  Returns the record count.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='tsv2seq')
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    outputPathname = args.pathname + ".seq"
    writer = SequenceFile.createWriter(outputPathname, Text, Text)
    count = 0
    start = datetime.datetime.now()
    try:
        # BUG FIX: a leftover debug "print f" used to dump the file object's
        # repr to stdout on every run; removed
        with open(args.pathname, 'r') as f:
            for line in f:
                try:
                    (url, payload) = line.split('\t')
                    key = Text()
                    key.set(url)
                    value = Text()
                    # I'm not at all sure why we would want to decode, not
                    # encode here; this is the only thing that worked
                    value.set(Text.decode(json.dumps(payload)))
                    writer.append(key, value)
                    count += 1
                except ValueError:
                    # deliberately skip malformed lines (zero or 2+ tabs)
                    pass
    finally:
        # make sure the SequenceFile is flushed/closed even on error
        writer.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED tsv2seq is %s" % elapsed(delta)
    return count
def main(argv=None):
    """this is called if run from command line

    Read "url\\tpayload" lines from stdin; emit "url\\tlen(payload)" per
    line to stdout, then running total/count lines and a stderr summary.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Test Length')
    args = parser.parse_args(args)
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    # NOTE: removed an unused compiled urlregex (backpage blob-store URL
    # pattern) that was dead code in this function
    count = 0
    total = 0
    url = ""
    for line in sys.stdin:
        m = lineregex.match(line)
        if m:
            url = m.group(1)
            payload = m.group(2)
            increment = len(str(payload))
            print >> sys.stdout, "%s\t%s" % (url, increment)
            count += 1
            total += increment
    # url holds the last matched url ("" if no line matched)
    print >> sys.stdout, "%s\ttotal=%s" % (url, total)
    print >> sys.stdout, "%s\tcount=%s" % (url, count)
    print >> sys.stderr, "dig.test.length processed %d records" % count
def main(argv=None):
    """Command-line entry point: report payload lengths of tab-separated stdin lines."""
    prog, cmdargs = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Test Length')
    cmdargs = parser.parse_args(cmdargs)
    line_pat = re.compile(r"""(^.+)\t(.*)""")
    # specific to first url scheme
    url_pat = re.compile(
        r"""https://karmadigstorage.blob.core.windows.net/arch/([a-zA-Z0-9]+)/(\d{8})/.*\.backpage\.com/(.*)"""
    )
    payload = ""
    url = ""
    count = 0
    total = 0
    for raw in sys.stdin:
        match = line_pat.match(raw)
        if not match:
            continue
        url = match.group(1)
        payload = match.group(2)
        size = len(str(payload))
        print >> sys.stdout, "%s\t%s" % (url, size)
        count += 1
        total += size
    print >> sys.stdout, "%s\ttotal=%s" % (url, total)
    print >> sys.stdout, "%s\tcount=%s" % (url, count)
    print >> sys.stderr, "dig.test.length processed %d records" % count
def main(argv=None):
    """this is called if run from command line

    Publish individual files (-f) and/or every entry of given
    directories (-d) via azure_publish_file, with content type -t.
    """
    start = datetime.datetime.now()
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='azure_publish')
    parser.add_argument('-d', '--directory', help='directory to publish',
                        required=False, action="append", default=[])
    parser.add_argument('-f', '--file', help='file to publish',
                        required=False, action="append", default=[])
    parser.add_argument('-t', '--type', help='content type', required=False,
                        choices=["text/html", "image/jpeg", "image/gif", "image/png"],
                        default="text/html")
    parser.add_argument('-v', '--verbose', help='print to stderr',
                        required=False, default=VERBOSE)
    args = parser.parse_args(args)
    files = args.file
    directories = args.directory
    verbose = args.verbose
    count = 0
    for pathname in files:
        azure_publish_file(pathname, content_type=args.type)
        count += 1
    for directory in directories:
        for entry in os.listdir(directory):
            # BUG FIX: os.listdir yields bare names; join with the directory
            # so the file can be found (previously only the name was passed,
            # which resolved against the CWD instead)
            azure_publish_file(os.path.join(directory, entry), content_type=args.type)
            count += 1
    end = datetime.datetime.now()
    delta = end - start
    if verbose:
        print >> sys.stderr, "ELAPSED azure_publish is %s" % elapsed(delta)
        print >> sys.stderr, "%d files uploaded" % (count)
def main(argv=None):
    """this is called if run from command line

    Tokenize the titleText/locationText/bodyText content of each JSON
    post read as "url\\tjson" from stdin, storing token lists per
    section, and echo the augmented post to stdout.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description="Token Number Extractor")
    args = parser.parse_args(args)
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    processed = 0
    url = None
    for line in sys.stdin:
        try:
            m = lineregex.match(line)
            if m:
                url = m.group(1)
                rawText = m.group(2)
                post = json.loads(rawText)
                if isinstance(post, dict):
                    titleText = post.get("titleText")
                    if titleText and titleText.get("content"):
                        tzr = Tokenizer(titleText["content"])
                        titleText["tokens"] = [t for t in tzr.genTokens()]
                    else:
                        # BUG FIX: this branch previously reported
                        # "No location text" for a missing title
                        print >> sys.stderr, "No title text for %r" % url
                    locationText = post.get("locationText")
                    if locationText and locationText.get("content"):
                        tzr = Tokenizer(locationText["content"])
                        locationText["tokens"] = [t for t in tzr.genTokens()]
                    else:
                        # BUG FIX: was "No title text" (messages were swapped)
                        print >> sys.stderr, "No location text for %r" % url
                    bodyText = post.get("bodyText")
                    if bodyText and bodyText.get("content"):
                        tzr = Tokenizer(bodyText["content"])
                        bodyText["tokens"] = [t for t in tzr.genTokens()]
                    else:
                        print >> sys.stderr, "No body text for %r" % url
                    processed += 1
                    js = json.dumps(post, sort_keys=True, indent=None)
                    print >> sys.stdout, "%s\t%s" % (url, js)
        except Exception as e:
            print >> sys.stderr, "dig.extract.entity.digtoken.digtoken Exception [%s]. Last url was [%s]" % (str(e), url)
    print >> sys.stderr, "dig.extract.entity.digtoken.digtoken processed %d records" % processed
def main(argv=None):
    """this is called if run from command line

    Extract phone numbers from the titleText/locationText/bodyText
    content of each JSON post read as "url\\tjson" from stdin.  Each
    section gets its own 'phoneNumbers' list; the post gets a
    deduplicated (order-preserving) top-level 'phoneNumbers' list.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Phone Number Extractor')
    args = parser.parse_args(args)
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    for line in sys.stdin:
        m = lineregex.match(line)
        if m:
            url = m.group(1)
            rawText = m.group(2)
            post = json.loads(rawText)
            if isinstance(post, dict):
                allPhoneNumbers = []
                # the three text sections are processed identically, in a
                # fixed order (title, location, body), same as before
                for field in ('titleText', 'locationText', 'bodyText'):
                    section = post.get(field)
                    if section and section.get('content'):
                        extr = PhoneExtractor(section['content'])
                        phoneNumbers = extr.extractPhoneNumbers()
                        # store locally with the specific text
                        section['phoneNumbers'] = phoneNumbers
                        allPhoneNumbers.extend(phoneNumbers)
                post['phoneNumbers'] = uniqueStable(allPhoneNumbers)
                js = json.dumps(post, sort_keys=True, indent=None)
                print >> sys.stdout, "%s\t%s" % (url, js)
def main(argv=None):
    """this is called if run from command line

    Tokenize the titleText/locationText/bodyText content of each JSON
    post read as "url\\tjson" from stdin, storing token lists per
    section, and echo the augmented post to stdout.  Per-line errors
    are logged to stderr and skipped.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Token Number Extractor')
    args = parser.parse_args(args)
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    processed = 0
    url = None
    for line in sys.stdin:
        try:
            m = lineregex.match(line)
            if m:
                url = m.group(1)
                rawText = m.group(2)
                post = json.loads(rawText)
                if isinstance(post, dict):
                    titleText = post.get('titleText')
                    if titleText and titleText.get('content'):
                        tzr = Tokenizer(titleText['content'])
                        titleText['tokens'] = [t for t in tzr.genTokens()]
                    else:
                        # BUG FIX: this branch previously reported
                        # "No location text" for a missing title
                        print >> sys.stderr, "No title text for %r" % url
                    locationText = post.get('locationText')
                    if locationText and locationText.get('content'):
                        tzr = Tokenizer(locationText['content'])
                        locationText['tokens'] = [t for t in tzr.genTokens()]
                    else:
                        # BUG FIX: was "No title text" (messages were swapped)
                        print >> sys.stderr, "No location text for %r" % url
                    bodyText = post.get('bodyText')
                    if bodyText and bodyText.get('content'):
                        tzr = Tokenizer(bodyText['content'])
                        bodyText['tokens'] = [t for t in tzr.genTokens()]
                    else:
                        print >> sys.stderr, "No body text for %r" % url
                    processed += 1
                    js = json.dumps(post, sort_keys=True, indent=None)
                    print >> sys.stdout, "%s\t%s" % (url, js)
        except Exception as e:
            print >> sys.stderr, "dig.extract.entity.digtoken.digtoken Exception [%s]. Last url was [%s]" % (str(e), url)
    print >> sys.stderr, "dig.extract.entity.digtoken.digtoken processed %d records" % processed
def main(argv=None):
    """this is called if run from command line

    Run BackpagePage extraction over "url\\tjson" lines from stdin.
    The URL must match the blob-store archive scheme; index pages are
    skipped.  Per-line errors are logged to stderr and skipped.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Backpage Extractor')
    args = parser.parse_args(args)
    pageCls = BackpagePage
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    # specific to first url scheme
    urlregex = re.compile(
        r"""https://karmadigstorage.blob.core.windows.net/arch/([a-zA-Z0-9]+)/(\d{8})/.*\.backpage\.com/(.*)"""
    )
    processed = 0
    url = None
    for line in sys.stdin:
        try:
            m = lineregex.match(line)
            if m:
                url = m.group(1)
                rawText = m.group(2)
                urlMatch = urlregex.match(url)
                if urlMatch:
                    crawlAgent = urlMatch.group(1)
                    datestamp = urlMatch.group(2)
                    tail = urlMatch.group(3)
                    # index pages are navigation only: skip them
                    # (was an empty "pass" branch with the work in "else")
                    if "index.html" not in tail:
                        pageStr = json.loads(rawText)
                        page = pageCls(url=url, content=pageStr,
                                       crawlAgent=crawlAgent,
                                       datestamp=int(datestamp))
                        page.process()
                        processed += 1
        except Exception as e:
            print >> sys.stderr, "dig.extract.page.backpage Exception [%s]. Last url was [%s]" % (str(e), url)
    print >> sys.stderr, "dig.extract.page.backpage processed %d records" % processed
def main(argv=None):
    """Command-line driver: run the Backpage page extractor over stdin lines."""
    prog, cmdargs = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Backpage Extractor')
    cmdargs = parser.parse_args(cmdargs)
    pageCls = BackpagePage
    line_pat = re.compile(r"""(^.+)\t(.*)""")
    # specific to first url scheme
    url_pat = re.compile(r"""https://karmadigstorage.blob.core.windows.net/arch/([a-zA-Z0-9]+)/(\d{8})/.*\.backpage\.com/(.*)""")
    processed = 0
    url = None
    for line in sys.stdin:
        try:
            # guard clauses replace the original nested ifs; behavior is the same
            line_match = line_pat.match(line)
            if not line_match:
                continue
            url = line_match.group(1)
            raw_text = line_match.group(2)
            url_match = url_pat.match(url)
            if not url_match:
                continue
            crawl_agent = url_match.group(1)
            datestamp = url_match.group(2)
            tail = url_match.group(3)
            if "index.html" in tail:
                continue
            content = json.loads(raw_text)
            page = pageCls(url=url, content=content,
                           crawlAgent=crawl_agent, datestamp=int(datestamp))
            page.process()
            processed += 1
        except Exception as e:
            print >> sys.stderr, "dig.extract.page.backpage Exception [%s]. Last url was [%s]" % (str(e), url)
    print >> sys.stderr, "dig.extract.page.backpage processed %d records" % processed
def main(argv=None):
    """this is called if run from command line

    Publish individual files (-f) and/or every entry of given
    directories (-d) via azure_publish_file, with content type -t.
    """
    start = datetime.datetime.now()
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='azure_publish')
    parser.add_argument('-d', '--directory', help='directory to publish',
                        required=False, action="append", default=[])
    parser.add_argument('-f', '--file', help='file to publish',
                        required=False, action="append", default=[])
    parser.add_argument('-t', '--type', help='content type', required=False,
                        choices=["text/html", "image/jpeg", "image/gif", "image/png"],
                        default="text/html")
    parser.add_argument('-v', '--verbose', help='print to stderr',
                        required=False, default=VERBOSE)
    args = parser.parse_args(args)
    files = args.file
    directories = args.directory
    verbose = args.verbose
    count = 0
    for pathname in files:
        azure_publish_file(pathname, content_type=args.type)
        count += 1
    for directory in directories:
        for entry in os.listdir(directory):
            # BUG FIX: os.listdir yields bare names; join with the directory
            # so the file can be found (previously only the name was passed,
            # which resolved against the CWD instead)
            azure_publish_file(os.path.join(directory, entry), content_type=args.type)
            count += 1
    end = datetime.datetime.now()
    delta = end - start
    if verbose:
        print >> sys.stderr, "ELAPSED azure_publish is %s" % elapsed(delta)
        print >> sys.stderr, "%d files uploaded" % (count)
def main(argv=None):
    """this is called if run from command line

    Run db2json with query -q (a SQL string, or a filename whose
    contents are the query), writing JSON (or TSV with -t) to -o or
    stdout.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='db2json')
    parser.add_argument('-l', '--limit', help='num rows to fetch', default=LIMIT)
    parser.add_argument('-o', '--output', help='output file', default=None)
    parser.add_argument('-q', '--query', help='query or query file', default=QUERY)
    parser.add_argument('-t', '--tab', help='drop outer brackets, use tab separators',
                        required=False, action='store_true')
    parser.add_argument('-v', '--verbose', help='verbose output',
                        required=False, action='store_true')
    args = parser.parse_args(args)
    try:
        # if -q names an existing file, read the query from it
        if os.path.exists(args.query):
            # BUG FIX: was open(...).read() which leaked the file handle
            with open(args.query, 'r') as qf:
                args.query = qf.read()
    except Exception as e:
        # fall back to treating the argument itself as the query text
        # (also fixed the "using as as query" typo in this message)
        print >> sys.stderr, "Failed to open %s [%r], using as query" % (args.query, e)
    if args.verbose:
        print >> sys.stderr, "output %s" % args.output
        print >> sys.stderr, "query %s" % args.query
        print >> sys.stderr, "tab %s" % args.tab
    if args.output:
        with open(args.output, "w") as f:
            db2json(outstream=f, query=args.query, tab=args.tab,
                    limit=args.limit, verbose=args.verbose)
    else:
        db2json(sys.stdout, query=args.query, tab=args.tab,
                limit=args.limit, verbose=args.verbose)
def main(argv=None):
    """this is called if run from command line

    Tokenize the titleText/locationText/bodyText content of each JSON
    post read as "url\\tjson" from stdin, storing token lists per
    section, and echo the augmented post to stdout.
    """
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='Token Number Extractor')
    args = parser.parse_args(args)
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    for line in sys.stdin:
        m = lineregex.match(line)
        if m:
            url = m.group(1)
            rawText = m.group(2)
            post = json.loads(rawText)
            if isinstance(post, dict):
                titleText = post.get('titleText')
                if titleText and titleText.get('content'):
                    tzr = Tokenizer(titleText['content'])
                    titleText['tokens'] = [t for t in tzr.genTokens()]
                else:
                    # BUG FIX: this branch previously reported
                    # "No location text" for a missing title
                    print >> sys.stderr, "No title text for %r" % url
                locationText = post.get('locationText')
                if locationText and locationText.get('content'):
                    tzr = Tokenizer(locationText['content'])
                    locationText['tokens'] = [t for t in tzr.genTokens()]
                else:
                    # BUG FIX: was "No title text" (messages were swapped)
                    print >> sys.stderr, "No location text for %r" % url
                bodyText = post.get('bodyText')
                if bodyText and bodyText.get('content'):
                    tzr = Tokenizer(bodyText['content'])
                    bodyText['tokens'] = [t for t in tzr.genTokens()]
                else:
                    print >> sys.stderr, "No body text for %r" % url
                js = json.dumps(post, sort_keys=True, indent=None)
                print >> sys.stdout, "%s\t%s" % (url, js)
def main(argv=None):
    """this is called if run from command line

    Scan the tokenized titleText/locationText/bodyText sections of each
    "url\\tjson" stdin post for phrase patterns of category -c, appending
    patternScanMatch records to each section's 'patternScanMatches'
    list, and echo the augmented post to stdout.
    """
    (prog, args) = interpretCmdLine()
    # fixed "PatterScanner" typo in the description
    parser = argparse.ArgumentParser(prog, description='PatternScanner')
    parser.add_argument('-c', '--category', help='category to match',
                        required=True, default=DEFAULTCATEGORY)
    args = parser.parse_args(args)
    category = args.category
    lineregex = re.compile(r"""(^.+)\t(.*)""")

    def scanSection(section):
        """Append pattern-scan matches for one text section, in place."""
        if not (section and section.get('tokens')):
            return
        scanner = PatternScanner(section['tokens'], category)
        # arguably we should only add novel matches; the implementation
        # would be to represent/convert patternScanMatch dicts to
        # namedtuples, use a set to uniquify, then convert back to dicts
        # to write out as JSON.  For now, we simply append to the list.
        matches = section.get('patternScanMatches') or []
        for (phrase, subseqs) in scanner.scan():
            for subseq in subseqs:
                matches.append({
                    "objectType": "patternScanMatch",
                    "phrasePattern": {"indic": phrase.indic,
                                      "category": phrase.category,
                                      "family": phrase.family,
                                      "tokenRegexPattern": str(phrase.pattern),
                                      "weight": phrase.weight},
                    "tokenSequence": subseq})
        # is it good practice to record empty results, or should we just
        # not have any such entry?
        section['patternScanMatches'] = matches

    for line in sys.stdin:
        m = lineregex.match(line)
        if m:
            url = m.group(1)
            rawText = m.group(2)
            post = json.loads(rawText)
            if isinstance(post, dict):
                # the three sections were handled by three duplicated
                # copies of the same code; factored into scanSection
                scanSection(post.get('titleText'))
                scanSection(post.get('locationText'))
                scanSection(post.get('bodyText'))
                js = json.dumps(post, sort_keys=True, indent=None)
                print >> sys.stdout, "%s\t%s" % (url, js)
def main(argv=None):
    """this is called if run from command line

    Scan the tokenized titleText/locationText/bodyText sections of each
    "url\\tjson" stdin post for phrase patterns of the requested
    category(ies) (-c, repeatable), appending patternScanMatch records
    to each section's 'patternScanMatches' list, and echo the augmented
    post to stdout.  Per-line errors are logged to stderr and skipped.
    """
    (prog, args) = interpretCmdLine()
    # fixed "PatterScanner" typo in the description
    parser = argparse.ArgumentParser(prog, description='PatternScanner')
    # BUG FIX: with action="append", user-supplied values are appended to a
    # non-empty default rather than replacing it, so DEFAULTCATEGORY was
    # always scanned in addition to the -c arguments.  Default to None and
    # fall back explicitly instead.
    parser.add_argument('-c', '--category', help='category(ies) to match',
                        required=True, action="append", default=None)
    args = parser.parse_args(args)
    categories = args.category or [DEFAULTCATEGORY]
    lineregex = re.compile(r"""(^.+)\t(.*)""")
    url = None
    processed = 0

    def scanSection(section, category):
        """Append pattern-scan matches for one text section, in place."""
        if not (section and section.get('tokens')):
            return
        scanner = PatternScanner(section['tokens'], category)
        # arguably we should only add novel matches; the implementation
        # would be to represent/convert patternScanMatch dicts to
        # namedtuples, use a set to uniquify, then convert back to dicts
        # to write out as JSON.  For now, we simply append to the list.
        matches = section.get('patternScanMatches') or []
        for (phrase, subseqs) in scanner.scan():
            for subseq in subseqs:
                matches.append({
                    "objectType": "patternScanMatch",
                    "phrasePattern": {"indic": phrase.indic,
                                      "category": phrase.category,
                                      "family": phrase.family,
                                      "tokenRegexPattern": str(phrase.pattern),
                                      "weight": phrase.weight},
                    "tokenSequence": subseq})
        # is it good practice to record empty results, or should we just
        # not have any such entry?
        section['patternScanMatches'] = matches

    for line in sys.stdin:
        try:
            m = lineregex.match(line)
            if m:
                url = m.group(1)
                rawText = m.group(2)
                post = json.loads(rawText)
                if isinstance(post, dict):
                    # the three sections were handled by three duplicated
                    # copies of the same code; factored into scanSection
                    for category in categories:
                        scanSection(post.get('titleText'), category)
                        scanSection(post.get('locationText'), category)
                        scanSection(post.get('bodyText'), category)
                    processed += 1
                    js = json.dumps(post, sort_keys=True, indent=None)
                    print >> sys.stdout, "%s\t%s" % (url, js)
        except Exception as e:
            print >> sys.stderr, "dig.extract.entity.classifier.patscan Exception [%s]. Last url was [%s]" % (str(e), url)
    print >> sys.stderr, "dig.extract.entity.classifier.patscan processed %d records" % processed