from pyspark import SparkConf, SparkContext
from pyspark.serializers import PickleSerializer
from nltk.tag import StanfordNERTagger

conf = SparkConf().setAppName("Entity Recognition").setMaster("local[*]")
sc = SparkContext(
    conf=conf,
    serializer=PickleSerializer(),  # default serializer
    # Fixed batch size (64) -> BatchedSerializer instead of AutoBatchedSerializer
    batchSize=64)

st = StanfordNERTagger(
    stanford + '/classifiers/english.all.3class.distsim.crf.ser.gz',
    stanford + '/stanford-ner.jar',
    encoding='utf-8')

# Split the input on WARC headers so each RDD element holds one whole WARC record.
rdd_whole_warc_file = rdd = sc.newAPIHadoopFile(
    in_file,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"textinputformat.record.delimiter": "WARC/1.0"})

# Strip WARC/HTTP headers and HTML markup, keeping only plain text.
rdd_html_cleaned = rdd_whole_warc_file.flatMap(
    lambda x: decode(x, record_attribute))
print("step 2")

# Extract named entities
candidate_entities = rdd_html_cleaned.map(
    lambda x: get_candidate_entities(x, st))
# stanford_rdd = rdd_html_cleaned.map(lambda x: ner_spacy(x))

print(candidate_entities.collect())
# print(stanford_rdd.collect())
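# The helpers above (decode, get_candidate_entities) are defined elsewhere in
# the pipeline. Below is a minimal, hypothetical sketch of what they might
# look like, assuming BeautifulSoup for HTML stripping and NLTK's
# word_tokenize; the names and signatures follow the calls above, not a
# confirmed implementation.
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

def decode(record, record_attribute):
    """Yield plain-text payloads of records that carry record_attribute."""
    _, payload = record  # (LongWritable offset, Text contents)
    if record_attribute not in payload:
        return
    # WARC/HTTP headers are separated from the body by a blank line.
    _, _, body = payload.partition('\r\n\r\n')
    text = BeautifulSoup(body, 'html.parser').get_text()
    if text.strip():
        yield text

def get_candidate_entities(text, tagger):
    """Return tokens the tagger labels with anything other than 'O'."""
    tagged = tagger.tag(word_tokenize(text))
    return [(token, label) for token, label in tagged if label != 'O']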
parser.add_argument("-X", "--mode", help="train|inference", default="train") # 模式 train表示 训练;inference 表示 推理 parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) # RDMA 模式 远程直接数据存取 parser.add_argument("-md", "--model_name", help="The model name",type=str,default="model.ckpt") # parser.add_argument("-md2", "--model_name2", help="The model name", type=str,default="model2.ckpt") parser.add_argument("-a", "--acc", help="Precision threshold", type=float,default=0.5) parser.add_argument("-dr", "--dropout", help="Retention rate", type=float,default=0.5) parser.add_argument("-lr", "--learning_rate", help="learning rate", type=float,default=1e-6) args = parser.parse_args() print("args:",args) print("{0} ===== Start".format(datetime.now().isoformat())) if args.format == "tfr": # HDFS==>numpy array images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", keyClass="org.apache.hadoop.io.BytesWritable", valueClass="org.apache.hadoop.io.NullWritable") def toNumpy(bytestr): example = tf.train.Example() example.ParseFromString(bytestr) features = example.features.feature image = numpy.array(features['image'].int64_list.value) label = numpy.array(features['label'].int64_list.value) return (image, label) dataRDD = images.map(lambda x: toNumpy(str(x[0]))) else: if args.format == "csv": # HDFS==>numpy array images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) else: # args.format == "pickle": # HDFS==>numpy array images = sc.pickleFile(args.images)
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default="mnist_model") parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions") parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1) parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000) parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true") parser.add_argument("-X", "--mode", help="train|inference", default="train") parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) args = parser.parse_args() print("args:",args) print("{0} ===== Start".format(datetime.now().isoformat())) if args.format == "tfr": images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", keyClass="org.apache.hadoop.io.BytesWritable", valueClass="org.apache.hadoop.io.NullWritable") def toNumpy(bytestr): example = tf.train.Example() example.ParseFromString(bytestr) features = example.features.feature image = numpy.array(features['image'].int64_list.value) label = numpy.array(features['label'].int64_list.value) return (image, label) dataRDD = images.map(lambda x: toNumpy(str(x[0]))) else: if args.format == "csv": images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) else: # args.format == "pickle": images = sc.pickleFile(args.images)