def basicTokenizer(task): task_tag = "NLP ADDRESS PARSER" print "\n\n************** %s [START] ******************\n" % task_tag print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id task.setStatus(412) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS doc = UnveillanceDocument(_id=task.doc_id) if doc is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag return txt = None if hasattr(task, "txt_file"): txt = doc.loadFile(task.txt_file) else: import os try: txt_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name'] txt = doc.loadFile(os.path.join(doc.base_path, txt_path)) except Exception as e: if DEBUG: print e if txt is None: print "TEXT FILE IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag return
def get_vector(uv_task):
    task_tag = "IMAGE: GETTING VECTOR"
    print "\n\n************** %s [START] ******************\n" % task_tag

    uv_task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from vars import ASSET_TAGS
    from conf import ANNEX_DIR, DEBUG

    import os, pypuzzle

    image = UnveillanceDocument(_id=uv_task.doc_id)

    hi_res = image.getAssetsByTagName(ASSET_TAGS['HIGH'])
    if hi_res is None:
        error_msg = "Could not find the hi-res clone"

        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag

        uv_task.fail(message=error_msg)
        return

    hi_res = os.path.join(ANNEX_DIR, image.base_path, hi_res[0]['file_name'])
    puzz = pypuzzle.Puzzle()

    if DEBUG:
        print "generate puzzle vector from %s" % hi_res

    try:
        cvec = puzz.get_cvec_from_file(hi_res)
    except Exception as e:
        error_msg = "Could not get image vector because %s" % e

        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag

        uv_task.fail(message=error_msg)
        return

    if not image.addAsset(cvec, "image_cvec.json", as_literal=False,
        tags=[ASSET_TAGS['IMAGE_CVEC']]):
        error_msg = "could not save cvec asset!"

        print error_msg
        print "\n\n************** %s [ERROR] ******************\n" % task_tag

        uv_task.fail(message=error_msg)
        return

    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
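# A minimal sketch, not part of the original task set: once two documents have
# an IMAGE_CVEC asset saved by get_vector above, pypuzzle can score their
# similarity directly from the stored vectors. compare_image_vectors is a
# hypothetical helper; it assumes the saved "image_cvec.json" deserializes back
# to the list pypuzzle produced, and that the pypuzzle bindings expose
# get_distance_from_cvec (lower distance means more similar).
def compare_image_vectors(doc_a, doc_b):
    import os, json, pypuzzle
    from vars import ASSET_TAGS

    cvecs = []
    for doc in [doc_a, doc_b]:
        # load each saved vector the same way other assets are loaded above
        cvec_path = doc.getAssetsByTagName(ASSET_TAGS['IMAGE_CVEC'])[0]['file_name']
        cvecs.append(json.loads(doc.loadFile(os.path.join(doc.base_path, cvec_path))))

    puzz = pypuzzle.Puzzle()
    return puzz.get_distance_from_cvec(cvecs[0], cvecs[1])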
def basicTokenizer(task): task_tag = "NLP TOKENIZER" print "\n\n************** %s [START] ******************\n" % task_tag print "TOKENIZING TEXT DOCUMENT at %s" % task.doc_id task.setStatus(412) from lib.Worker.Models.uv_document import UnveillanceDocument from conf import DEBUG from vars import ASSET_TAGS doc = UnveillanceDocument(_id=task.doc_id) if doc is None: print "DOC IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag return txt = None from json import loads if hasattr(task, "txt_file"): txt = loads(doc.loadFile(task.txt_file)) else: import os try: txt_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name'] txt = loads(doc.loadFile(os.path.join(doc.base_path, txt_path))) except Exception as e: if DEBUG: print e if txt is None: print "TEXT FILE IS NONE" print "\n\n************** %s [ERROR] ******************\n" % task_tag return from lib.Worker.Models.cp_nlp_server import CompassNLPServer nlp_server = CompassNLPServer() tokenized = nlp_server.sendNLPRequest({ 'method' : 'tokenize', 'txt' : txt }) if tokenized is None: print "COULD NOT TOKENIZE." print "\n\n************** %s [ERROR] ******************\n" % task_tag return if DEBUG: print "here is res" print type(tokenized) asset_path = doc.addAsset(tokenized, "core_nlp_tokenized.json", as_literal=False, description="tokenized output from Stanford Core NLP", tags=[ASSET_TAGS['TOKENS_NLP']]) if asset_path is None or not doc.addFile(asset_path, None, sync=True): print "COULD NOT SAVE ASSET." print "\n\n************** %s [ERROR] ******************\n" % task_tag return doc.addCompletedTask(task.task_path) task.finish() print "\n\n************** %s [END] ******************\n" % task_tag
def addressParser(task):
    task_tag = "NLP ADDRESS PARSER"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "EXTRACTING ADDRESSES FROM TEXT DOCUMENT at %s" % task.doc_id

    task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=task.doc_id)
    if doc is None:
        print "DOC IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    txt = None
    if hasattr(task, "txt_file"):
        txt = doc.loadFile(task.txt_file)
    else:
        import os

        try:
            txt_path = doc.getAssetsByTagName(ASSET_TAGS["TXT_JSON"])[0]["file_name"]
            txt = doc.loadFile(os.path.join(doc.base_path, txt_path))
        except Exception as e:
            if DEBUG: print e

    if txt is None:
        print "TEXT FILE IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    import re

    # script from https://code.google.com/p/ebcode/ -> ebdata.tar.gz -> ebdata/nlp/addresses.py
    #
    # Regex notes:
    #   * This is *not* a case-insensitive regex, because we assume
    #     capitalized words are special (street names).
    #   * All data matched by capturing parentheses is concatenated together, so
    #     if you don't want to include something in the resulting string, don't
    #     capture it.

    # STREET_NAME is a fragment of a regular expression that is used in several
    # places in our "real" regular expression (ADDRESSES_RE) below. The one tricky
    # thing about it is that it includes a "CAPTURE_START" placeholder instead of
    # a capturing opening parenthesis. This lets us create two versions of the
    # regex -- STREET_NAME_CAPTURE and STREET_NAME_NOCAPTURE.
    STREET_NAME = r"""
        # Here, we define some common false positives and tell the regex to ignore them.
        (?!
            [Aa][Ss][Ss][Oo][Cc][Ii][Aa][Tt][Ee][Dd]\ [Pp][Rr][Ee][Ss][Ss] # associated press
            |
            [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\ [Oo][Ff] # university of
        )

        # DIRECTION
        %(CAPTURE_START)s
            (?:
                [NSEWnsew]\.?
                |
                (?:
                    [Nn][Oo][Rr][Tt][Hh] |
                    [Ss][Oo][Uu][Tt][Hh] |
                    [Ee][Aa][Ss][Tt] |
                    [Ww][Ee][Ss][Tt] |
                    [Nn][Oo][Rr][Tt][Hh][Ee][Aa][Ss][Tt] |
                    [Nn][Oo][Rr][Tt][Hh][Ww][Ee][Ss][Tt] |
                    [Ss][Oo][Uu][Tt][Hh][Ee][Aa][Ss][Tt] |
                    [Ss][Oo][Uu][Tt][Hh][Ww][Ee][Ss][Tt]
                )
                |
                (?:
                    N\.?W | S\.?W | N\.?E | S\.?E
                )\.?
            )
            \ + # space (but not newline)
        )?

        (?:
            # STREET NAME
            %(CAPTURE_START)s
                # Numbered street names with a suffix ("3rd", "4th").
                \d+(?:st|ST|nd|ND|rd|RD|th|TH|d|D)

                |

                # Or, numbered street names without a suffix ("3", "4")
                # but with a street type.
                \d+
                (?=
                    \ +
                    (?:Ave|Avenue|Blvd|Boulevard|Bvd|Cir|Circle|Court|Ct|Dr|Drive|
                       Lane|Ln|Parkway|Pkwy|Place|Plaza|Pl|Plz|Point|Pt|Pts|Rd|Rte|
                       Sq|Sqs|Street|Streets|St|Sts|Terrace|Ter|Terr|Trl|Way|Wy
                    )
                    \b
                )

                |

                # Or, street names that don't start with numbers.
                (?:
                    # Optional prefixes --
                    #   "St", as in "St Louis"
                    #   "Dr. Martin", as in "Dr. Martin Luther King"
                    (?:
                        [Ss][Tt]\.?
                        |
                        [Dd][Rr]\.?\ [Mm][Aa][Rr][Tt][Ii][Nn]
                    )
                    \ +
                )?
                (?:
                    Mass\.(?=\ +[Aa]ve) # Special case: "Mass." abbr. for "Massachussetts Ave."
                                        # Needs to be special-cased because of the period.
                    |
                    (?:Avenue|Ave\.?)\ +[A-Z] # Special case: "Avenue X"
                    |
                    [A-Z][a-z][A-Za-z]* # One initial-capped word
                    |
                    [A-Z]\b # Single-letter street name (e.g., K St. in DC)
                    (?!\.\w) # Avoid '20 U.S.A.'
                )
            )
            (?:
                # Here, we list the options with street suffixes first, so that
                # the suffix abbreviations are treated as the last part of the
                # street name, to avoid overeagerly capturing "123 Main St. The".
                %(CAPTURE_START)s
                    \ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                    |
                    \ +[A-Z][a-z][A-Za-z]*\ (?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                    |
                    (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){2}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                    |
                    (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){3}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                    |
                    (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){4}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                    |
                    (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){5}\ +(?:Ave|Blvd|Bvd|Cir|Ct|Dr|Ln|Pkwy|Pl|Plz|Pt|Pts|Rd|Rte|Sq|Sqs|St|Sts|Ter|Terr|Trl|Wy)\.
                    |
                    (?:,?\ Jr\.?,?|\ +[A-Z][a-z][A-Za-z]*){1,5}
                )?

                # OPTIONAL POST-DIR
                (?:
                    # Standard post-dir format
                    %(CAPTURE_START)s
                        ,?\s(?:N\.?E|S\.?E|N\.?W|S\.?W|N|S|E|W)\.?
                    )
                    # Avoid greedily capturing more letters, like
                    # '123 Main St, New England' to '123 Main St, N'
                    (?![A-Za-z])

                    |

                    # Or, a special-case for DC quadrants, to find stuff like:
                    #   "600 H Street in NE Washington"
                    #   "600 H Street in the NE quadrant"
                    #   "600 H Street in northeast DC"
                    # Note that this is NOT captured, so that it's excluded from
                    # the final output.
                    ,?\s
                    in
                    %(CAPTURE_START)s
                        \s
                    )
                    (?:
                        (?:the|far)
                        \s
                    )?
                    %(CAPTURE_START)s
                        (?:NE|SE|NW|SW|[Nn]ortheast|[Ss]outheast|[Nn]orthwest|[Ss]outhwest)
                        (?=
                            \s
                            (?:quadrant|D\.?C\.?|Washington)
                        )
                    )
                )?
            )?
        )
    """

    STREET_NAME_CAPTURE = STREET_NAME % {"CAPTURE_START": "("}
    STREET_NAME_NOCAPTURE = STREET_NAME % {"CAPTURE_START": "(?:"}

    ADDRESSES_RE = re.compile(r"""(?x)
        (?<!-|/|:|,|\.|\$) # These various characters are not allowed before an address/intersection.
        \b

        # Ignore things that look like dates -- e.g., "21 May 2009".
        # This is a problem e.g. in cases where there's a May Street.
        (?!
            \d+\s+
            (?:January|February|March|April|May|June|July|August|September|October|November|December)
            ,?\s+
            \d\d\d\d
        )

        # Ignore intersections that are prefixed by "University of", like
        # "University of Texas at Austin". This is a common false positive.
        (?<!
            [Uu][Nn][Ii][Vv][Ee][Rr][Ss][Ii][Tt][Yy]\s[Oo][Ff]\s
        )

        (?:
            # SEGMENT ("FOO BETWEEN BAR AND BAZ")
            (?:
                %(STREET_NAME_CAPTURE)s
                (,?\ + between \ +)
                %(STREET_NAME_CAPTURE)s
                (,?\ + and \ +)
                %(STREET_NAME_CAPTURE)s
                |
                %(STREET_NAME_CAPTURE)s
                (,?\ + from \ +)
                %(STREET_NAME_CAPTURE)s
                (,?\ + to \ +)
                %(STREET_NAME_CAPTURE)s
            )

            |

            # BLOCK/ADDRESS
            (?:
                (
                    (?:
                        (?:\d+|[Ff][Ii][Rr][Ss][Tt])[-\ ]
                        (?:(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ )?
                        [Bb][Ll][Oo][Cc][Kk]\ [Oo][Ff]
                        |
                        \d+\ *-\ *\d+
                        |
                        \d+
                    )
                    \ +
                )
                %(STREET_NAME_CAPTURE)s

                # ignore the intersection in parenthesis so that it's not picked
                # up as a separate location. We do this by consuming the string
                # but *not* capturing it.
                (?:
                    \ +
                    \(?
                    between
                    \ +
                    %(STREET_NAME_NOCAPTURE)s
                    \ +
                    and
                    \ +
                    %(STREET_NAME_NOCAPTURE)s
                    \)?
                )?
            )

            |

            # INTERSECTION
            (?:
                # Common intersection prefixes. They're included here so that the
                # regex doesn't include them as part of the street name.
                (?:
                    (?:
                        [Nn]ear |
                        [Aa]t |
                        [Oo]n |
                        [Tt]o |
                        [Aa]round |
                        [Ii]ntersection\ of |
                        [Cc]orner\ of |
                        [Aa]rea\ of |
                        [Aa]reas?\ surrounding |
                        vicinity\ of |
                        ran\ down |
                        running\ down |
                        crossed
                    )
                    \ +
                )?
                \b
                (?:%(STREET_NAME_CAPTURE)s)
                (\ +)
                (
                    (?:
                        [Aa][Nn][Dd] |
                        [Aa][Tt] |
                        [Nn][Ee][Aa][Rr] |
                        & |
                        [Aa][Rr][Oo][Uu][Nn][Dd] |
                        [Tt][Oo][Ww][Aa][Rr][Dd][Ss]? |
                        [Oo][Ff][Ff] |
                        (?:[Jj][Uu][Ss][Tt]\ )?(?:[Nn][Oo][Rr][Tt][Hh]|[Ss][Oo][Uu][Tt][Hh]|[Ee][Aa][Ss][Tt]|[Ww][Ee][Ss][Tt])\ [Oo][Ff] |
                        (?:[Jj][Uu][Ss][Tt]\ )?[Pp][Aa][Ss][Tt]
                    )
                    \ +
                )
                (?:%(STREET_NAME_CAPTURE)s)
            )
        )

        # OPTIONAL CITY SUFFIX
        (?:
            (?:
                ,?\s+in
                |
                ,
            )
            \s+

            # CITY NAME
            (
                [A-Z][a-z][A-Za-z]* # One initial-capped word
                (?:
                    ,?\ Jr\.?,?
                    |
                    \ [A-Z][a-z][A-Za-z]*
                    |
                    -[A-Za-z]+ # Hyphenated words (e.g. "Croton-on-Hudson" in NY)
                ){0,4} # Initial-capped words
            )
        )?
    """ % {"STREET_NAME_CAPTURE": STREET_NAME_CAPTURE, "STREET_NAME_NOCAPTURE": STREET_NAME_NOCAPTURE})

    addresses = parse_addresses(txt, ADDRESSES_RE)
    if addresses is None:
        print "COULD NOT EXTRACT ADDRESSES."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    asset_path = doc.addAsset(addresses, "addresses.json", as_literal=False,
        description="addresses output from Everyblock address extractor",
        tags=[ASSET_TAGS["ADDRESSES_NLP"], ASSET_TAGS["CP_ENTITIES"]])

    if asset_path is None or not doc.addFile(asset_path, None, sync=True):
        print "COULD NOT SAVE ASSET."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        task.fail()
        return

    doc.addCompletedTask(task.task_path)
    task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    task.finish()
def generatePageMap(uv_task):
    task_tag = "PAGE MAPPER"
    print "\n\n************** %s [START] ******************\n" % task_tag
    print "MAPPING PAGES FROM TEXT DOCUMENT at %s" % uv_task.doc_id

    uv_task.setStatus(302)

    from lib.Worker.Models.uv_document import UnveillanceDocument
    from conf import DEBUG
    from vars import ASSET_TAGS

    doc = UnveillanceDocument(_id=uv_task.doc_id)
    if doc is None:
        print "DOC IS NONE"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    import os, json

    # initialize both so a failed load below reads as None instead of a NameError
    pages = None
    bow = None

    try:
        page_path = doc.getAssetsByTagName(ASSET_TAGS['TXT_JSON'])[0]['file_name']
        pages = json.loads(doc.loadFile(os.path.join(doc.base_path, page_path)))
    except Exception as e:
        if DEBUG: print e

    try:
        bow_path = doc.getAssetsByTagName(ASSET_TAGS['BOW'])[0]['file_name']
        bow = json.loads(doc.loadFile(os.path.join(doc.base_path, bow_path)))
    except Exception as e:
        if DEBUG: print e

    if pages is None or bow is None:
        print "NO PAGES OR BAG OF WORDS"
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    # map each page against the unique, non-stopword words in the bag of words,
    # recording a per-page count for each word
    from numpy import intersect1d, setdiff1d
    from conf import getConfig

    if hasattr(uv_task, "stopwords"):
        stopwords = uv_task.stopwords
    else:
        stopwords = os.path.join(getConfig('nlp_server.path'), "stopwords.json")

    try:
        with open(stopwords, 'rb') as S:
            if hasattr(uv_task, "stopwords_lang"):
                lang = uv_task.stopwords_lang
            else:
                lang = "english"

            stopwords = json.loads(S.read())[lang]
    except Exception as e:
        print "NO STOPWORDS...\n%s" % e
        print "\n\n************** %s [WARN] ******************\n" % task_tag

        # fall back to an empty list so the sifting below still works
        stopwords = []

    page_map = []

    print "STOPWORDS: (len %d)\nTOP:\n%s\n" % (len(stopwords), stopwords[:10])
    print "BAG OF WORDS: (len %d)\nTOP:\n%s\n" % (len(bow), bow[:10])

    use_words = [w for w in setdiff1d(bow, stopwords).tolist() if len(w) > 1]
    print "SIFTING BAG OF WORDS (old len: %d, new len: %d)" % (len(bow), len(use_words))

    global_info = {}

    for i, p in enumerate(pages):
        if p is None:
            continue

        page_bow = p.lower().split(" ")

        words = intersect1d(use_words, page_bow).tolist()
        if len(words) == 0:
            continue

        word_map = []
        frequency_max = 0

        for word in words:
            word_info = {
                'word' : word,
                'count' : page_bow.count(word)
            }

            word_map.append(word_info)

            if word_info['count'] > frequency_max:
                frequency_max = word_info['count']

            if word not in global_info.keys():
                global_info[word] = 0

            global_info[word] += word_info['count']

        page_map.append({
            'index' : i,
            'map' : word_map,
            'frequency_max' : frequency_max
        })

    if len(page_map) > 0:
        global_info['uv_page_map'] = page_map

    asset_path = doc.addAsset(global_info, "page_map.json", as_literal=False,
        description="word frequencies, page-by-page", tags=[ASSET_TAGS['PAGE_MAP']])

    if asset_path is None or not doc.addFile(asset_path, None, sync=True):
        print "COULD NOT SAVE ASSET."
        print "\n\n************** %s [ERROR] ******************\n" % task_tag
        uv_task.fail()
        return

    doc.addCompletedTask(uv_task.task_path)
    uv_task.routeNext()
    print "\n\n************** %s [END] ******************\n" % task_tag
    uv_task.finish()
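# For reference, the page_map.json asset written above has (roughly) this
# shape: corpus-wide totals keyed by word, plus the page-by-page breakdown
# under 'uv_page_map'. The words and counts here are illustrative only:
#
# {
#     "pipeline" : 12,
#     "annex" : 7,
#     "uv_page_map" : [
#         {
#             "index" : 0,
#             "frequency_max" : 4,
#             "map" : [
#                 { "word" : "pipeline", "count" : 4 },
#                 { "word" : "annex", "count" : 1 }
#             ]
#         }
#     ]
# }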