def driver(sc, inputFilename, outputDirectory, crfExecutable, crfScript, featureListFilename, crfModelFilename, eyeColorRef, eyeColorConfig, hairRef, hairConfig, limit=None, location='hdfs', outputFormat="text", partitions=None):
    # NOTE: this function relies on the surrounding module for its imports
    # (json, os, shutil, crf_features, HybridJaccard, SparkFiles from pyspark)
    # and for the helpers extract_body, textTokens, vectorToString, restore,
    # computeSpans, alignToControlledVocab, and the ff() dump-path helper.
    dump = False
    partitions = None  # debug override: ignore the partitions argument

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except OSError:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    origSize = rdd_sequence_file_input.count()
    # if limit:
    #     rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    # all below should also be done for title
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body.setName('rdd_body')
    # rdd_body.persist()
    if dump:
        rdd_body.saveAsTextFile(ff("body"))

    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
    rdd_body_tokens.setName('rdd_body_tokens')
    # rdd_body_tokens.persist()
    if dump:
        rdd_body_tokens.saveAsTextFile(ff("body_tokens"))

    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()
    if dump:
        rdd_features.saveAsTextFile(ff("features"))

    # rdd_pipeinput = rdd_features.mapValues(lambda x: base64.b64encode(vectorToString(x)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()
    if dump:
        rdd_pipeinput.values().saveAsTextFile(ff("pi"))
    # This caused a "cannot concatenate string + None" error
    # rdd_pipeinput.saveAsTextFile(outputDirectory + "-pipeinput")

    # DON'T USE SparkFiles.get to fetch the crf_test binary or model:
    # that only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    # Debugging short-circuit: dump the pipe input and stop before running CRF++.
    # rdd_pipeinput.saveAsTextFile(ff("before"))
    # exit(0)

    rdd_crf_b64 = rdd_pipeinput.values().pipe(cmd)
    rdd_crf_b64.setName('rdd_crf_b64')
    # rdd_crf_b64.persist()
    if dump:
        rdd_crf_b64.saveAsTextFile(ff("po"))

    # Go directly from base64 output to a reconstructed tuple format mapping URI
    # to a vector of vectors, with an empty-string suffix indicating a blank line.
    # This is key for avoiding the groupBy step.
    rdd_restore = rdd_crf_b64.map(lambda x: restore(x))
    rdd_restore.setName('rdd_restore')
    # rdd_restore.persist()
    if dump:
        rdd_restore.saveAsTextFile(ff("restore"))

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    rdd_harvested = rdd_restore.mapValues(lambda x: computeSpans(x, indexed=True)).filter(lambda p: p[1])
    rdd_harvested.setName('rdd_harvested')
    # rdd_harvested.persist()
    if dump:
        rdd_harvested.saveAsTextFile(ff("harvested"))

    # This has the effect of generating 0, 1, 2, ... lines according to the number of spans
    rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))
    rdd_controlled.setName('rdd_controlled')
    # rdd_controlled.persist()

    # map any eyeColor spans using smEyeColor, hairType spans using smHairColor
    rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
    rdd_aligned.setName('rdd_aligned')
    # rdd_aligned.persist()
    if dump:
        rdd_aligned.saveAsTextFile(ff("aligned"))

    rdd_aligned_json = rdd_aligned.mapValues(lambda x: json.dumps(x))
    rdd_aligned_json.setName('rdd_aligned_json')
    # rdd_aligned_json.persist()
    if dump:
        rdd_aligned_json.saveAsTextFile(ff("aligned_json"))

    rdd_final = rdd_aligned_json
    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        # print len(rdd_final.collect())
        if outputFormat == "sequence":
            rdd_final.saveAsSequenceFile(outputDirectory)
        elif outputFormat == "text":
            rdd_final.saveAsTextFile(outputDirectory)
        else:
            raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % outputDirectory
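# --- Usage sketch (assumption): a minimal main() showing how driver() might be
# --- invoked. All paths, the app name, and the argument values are hypothetical
# --- placeholders; the real entry point presumably parses them from the command line.
if __name__ == '__main__':
    from pyspark import SparkContext
    sc = SparkContext(appName="crf-align-sketch")
    driver(sc,
           "/data/input_ads_seq",               # hypothetical sequence file of (URI, JSON) pairs
           "/data/output_crf",                  # hypothetical output directory
           "crf_test",                          # CRF++ test binary shipped to every worker
           "crf_test_filter.sh",                # hypothetical wrapper script run via pipe()
           "features.hair-eye",                 # hypothetical CRF feature list
           "dig-hair-eye.model",                # hypothetical CRF++ model file
           "eyeColor_reference_wiki.txt", "eyeColor_config.txt",
           "hairColor_reference_wiki.txt", "hairColor_config.txt",
           limit=None, location='local', outputFormat="text", partitions=8)
    sc.stop()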
rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))

# TBD
# rdd_title = rdd_json.mapValues(lambda x: extract_title(x))
# rdd_title_tokens = rdd_title.mapValues(lambda x: textTokens(x))
# all below should also be done for title

# not a pair RDD?
rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))

cmd = SparkFiles.get("crf_test") + " -m " + SparkFiles.get(crfModelFilename)
rdd_crf = rdd_pipeinput.values().pipe(cmd)

# not a pair RDD,
# but we have the URI in the -3 position
# and the index in the -2 position
rdd_withuri = rdd_crf.map(lambda x: reconstructTuple(x))
rdd_grouped = rdd_withuri.groupByKey()
rdd_flat = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))])
rdd_harvested = rdd_flat.mapValues(lambda x: computeSpans(x, indexed=True))

# This has the effect of generating 0, 1, 2, ... lines according to the number of spans
rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))

# map any eyeColor spans using smEye, hairType spans using smHair
rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEye, "hairType": smHair}))

rdd_final = rdd_aligned
rdd_final.saveAsTextFile(outputDirectory)
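# --- Setup sketch (assumption): the fragment above resolves the CRF++ binary and
# --- model with SparkFiles.get, which only works if they were first shipped to the
# --- workers with sc.addFile (and, per the note in driver() above, reliably only with
# --- --master local[*]). The local paths below are hypothetical placeholders.
# sc.addFile("/usr/local/bin/crf_test")               # hypothetical path to the CRF++ test binary
# sc.addFile("/path/to/models/" + crfModelFilename)   # hypothetical path to the CRF++ model file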
def crfalign(sc, inputFilename, outputDirectory, limit=LIMIT, location='hdfs', outputFormat="text", partitions=None, deleteFirst=True):
    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")

    def cpath(n):
        return os.path.join(crfConfigDir, n)

    smEyeColor = HybridJaccard(ref_path=cpath("eyeColor_reference_wiki.txt"), config_path=cpath("eyeColor_config.txt"))
    smHairColor = HybridJaccard(ref_path=cpath("hairColor_reference_wiki.txt"), config_path=cpath("hairColor_config.txt"))
    print smEyeColor, smHairColor

    if location == "hdfs":
        if deleteFirst:
            # Client/FileNotFoundException are assumed to come from the snakebite HDFS client.
            namenode = "memex-nn1"
            port = 8020
            client = Client(namenode, port, use_trash=True)
            try:
                for deleted in client.delete([outputDirectory], recurse=True):
                    print deleted
            except FileNotFoundException:
                pass

    # hypothesis1: data fetched this way prompts the lzo compression error
    # hypothesis2: but it doesn't matter, the error is just a warning
    if partitions:
        if limit:
            # Load first, then down-sample to `limit` records before repartitioning.
            rdd_crfl = sc.textFile(inputFilename)
            rdd_crfl = sc.parallelize(rdd_crfl.take(limit))
            rdd_crfl = rdd_crfl.repartition(partitions)
        else:
            print inputFilename
            rdd_crfl = sc.textFile(inputFilename, minPartitions=partitions)
    else:
        rdd_crfl = sc.textFile(inputFilename)
    rdd_crfl.setName('rdd_crfl')
    # rdd_crfl.persist()
    print "beginning: %s partitions" % rdd_crfl.getNumPartitions()

    # "value-only" RDD, not a pair RDD,
    # but we have the URI in the -3 position
    # and the index in the -2 position
    rdd_withuri = rdd_crfl.map(lambda x: reconstructTuple(x))
    # Note: groupByKey returns an iterable, not data, so there is no point in printing it
    rdd_grouped = rdd_withuri.groupByKey()
    # sort the vectors by index (within key groups)
    rdd_sorted = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))])
    # find all contiguous spans of marked-up tokens
    # returns 0 or more dicts per URI key
    rdd_spans = rdd_sorted.mapValues(lambda x: computeSpans(x, indexed=True))
    # flatten to (URI, single dict) on each line
    rdd_flat = rdd_spans.flatMapValues(lambda x: list(x))
    # rdd_flat = rdd_flat.coalesce(rdd_flat.getNumPartitions() / 3)

    # # map any eyeColor spans using smEyeColor, hairType spans using smHairColor
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
    rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor.findBestMatch, "hairType": smHairColor.findBestMatch}))
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": fakeFindBestMatch, "hairType": fakeFindBestMatch}))
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {}))
    # rdd_aligned = rdd_spans
    # rdd_final = rdd_crfl
    rdd_final = rdd_aligned
    print outputFormat
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        print "saving to %s" % outputDirectory
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
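# --- Illustrative sketch only (assumption): the real reconstructTuple is defined
# --- elsewhere in the project. Based on the comments in crfalign(), each CRF++ output
# --- line is assumed to be tab-separated with the URI at position -3 and the token
# --- index at position -2, and the function is assumed to return a (URI, row) pair
# --- whose row starts with the index so the later sort key int(r[0]) and the l[1:]
# --- slice in rdd_sorted work as written.
def reconstructTupleSketch(line):
    fields = line.split('\t')
    uri = fields[-3]
    idx = fields[-2]
    # keep the index first, drop the URI/index columns, keep tokens/features and the label
    return (uri, [idx] + fields[:-3] + [fields[-1]])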
def driver(sc, inputFilename, outputDirectory, crfExecutable, crfScript, featureListFilename, crfModelFilename, eyeColorRef, eyeColorConfig, hairRef, hairConfig, limit=None, location='hdfs', outputFormat="text", partitions=None):
    dump = False
    partitions = 8  # debug override: ignore the partitions argument

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except OSError:
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    origSize = rdd_sequence_file_input.count()
    # if limit:
    #     rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    # Entire pipeline as one chained expression:
    # parse JSON -> extract body -> tokenize -> CRF feature matrix -> base64 pipe input
    # -> CRF++ via pipe() -> restore (URI, rows) pairs -> spans -> controlled-vocabulary
    # alignment -> JSON output.
    rdd_final = (rdd_sequence_file_input
                 .mapValues(lambda x: json.loads(x))
                 .mapValues(lambda x: extract_body(x))
                 .mapValues(lambda x: textTokens(x))
                 .map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
                 .mapValues(lambda x: base64.b64encode(vectorToString(x)))
                 .values()
                 .pipe(cmd)
                 .map(lambda x: restore(x))
                 .mapValues(lambda x: computeSpans(x, indexed=True))
                 .filter(lambda p: p[1])
                 .flatMapValues(lambda x: list(x))
                 .mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
                 .mapValues(lambda x: json.dumps(x)))

    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        print len(rdd_final.collect())
        # if outputFormat == "sequence":
        #     rdd_final.saveAsSequenceFile(outputDirectory)
        # elif outputFormat == "text":
        #     rdd_final.saveAsTextFile(outputDirectory)
        # else:
        #     raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % outputDirectory