from pyspark import SparkConf, SparkContext
from pyspark.serializers import PickleSerializer
from nltk.tag import StanfordNERTagger

conf = SparkConf().setAppName("Entity Recognition").setMaster("local[*]")
sc = SparkContext(
    conf=conf,
    serializer=PickleSerializer(),  # default serializer
    # Fixed batch size (64) -> BatchedSerializer instead of AutoBatchedSerializer
    batchSize=64)

st = StanfordNERTagger(
    stanford + '/classifiers/english.all.3class.distsim.crf.ser.gz',
    stanford + '/stanford-ner.jar',
    encoding='utf-8')

# Split the input on WARC headers so each RDD element holds one whole WARC record.
rdd_whole_warc_file = rdd = sc.newAPIHadoopFile(
    in_file,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"textinputformat.record.delimiter": "WARC/1.0"})

# Strip WARC/HTTP headers and HTML markup, keeping only plain text.
rdd_html_cleaned = rdd_whole_warc_file.flatMap(
    lambda x: decode(x, record_attribute))
print("step 2")

# Extract named entities
candidate_entities = rdd_html_cleaned.map(
    lambda x: get_candidate_entities(x, st))
# stanford_rdd = rdd_html_cleaned.map(lambda x: ner_spacy(x))

print(candidate_entities.collect())
# print(stanford_rdd.collect())
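# The helpers above (decode, get_candidate_entities) are defined elsewhere in
# the pipeline. Below is a minimal, hypothetical sketch of what they might
# look like, assuming BeautifulSoup for HTML stripping and NLTK's
# word_tokenize; the names and signatures follow the calls above, not a
# confirmed implementation.
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

def decode(record, record_attribute):
    """Yield plain-text payloads of records that carry record_attribute."""
    _, payload = record  # (LongWritable offset, Text contents)
    if record_attribute not in payload:
        return
    # WARC/HTTP headers are separated from the body by a blank line.
    _, _, body = payload.partition('\r\n\r\n')
    text = BeautifulSoup(body, 'html.parser').get_text()
    if text.strip():
        yield text

def get_candidate_entities(text, tagger):
    """Return tokens the tagger labels with anything other than 'O'."""
    tagged = tagger.tag(word_tokenize(text))
    return [(token, label) for token, label in tagged if label != 'O']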
parser.add_argument("-X", "--mode", help="train|inference", default="train") # 模式 train表示 训练;inference 表示 推理 parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) # RDMA 模式 远程直接数据存取 parser.add_argument("-md", "--model_name", help="The model name",type=str,default="model.ckpt") # parser.add_argument("-md2", "--model_name2", help="The model name", type=str,default="model2.ckpt") parser.add_argument("-a", "--acc", help="Precision threshold", type=float,default=0.5) parser.add_argument("-dr", "--dropout", help="Retention rate", type=float,default=0.5) parser.add_argument("-lr", "--learning_rate", help="learning rate", type=float,default=1e-6) args = parser.parse_args() print("args:",args) print("{0} ===== Start".format(datetime.now().isoformat())) if args.format == "tfr": # HDFS==>numpy array images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", keyClass="org.apache.hadoop.io.BytesWritable", valueClass="org.apache.hadoop.io.NullWritable") def toNumpy(bytestr): example = tf.train.Example() example.ParseFromString(bytestr) features = example.features.feature image = numpy.array(features['image'].int64_list.value) label = numpy.array(features['label'].int64_list.value) return (image, label) dataRDD = images.map(lambda x: toNumpy(str(x[0]))) else: if args.format == "csv": # HDFS==>numpy array images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) else: # args.format == "pickle": # HDFS==>numpy array images = sc.pickleFile(args.images)
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference", default="mnist_model") parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions") parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1) parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000) parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true") parser.add_argument("-X", "--mode", help="train|inference", default="train") parser.add_argument("-c", "--rdma", help="use rdma connection", default=False) args = parser.parse_args() print("args:",args) print("{0} ===== Start".format(datetime.now().isoformat())) if args.format == "tfr": images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", keyClass="org.apache.hadoop.io.BytesWritable", valueClass="org.apache.hadoop.io.NullWritable") def toNumpy(bytestr): example = tf.train.Example() example.ParseFromString(bytestr) features = example.features.feature image = numpy.array(features['image'].int64_list.value) label = numpy.array(features['label'].int64_list.value) return (image, label) dataRDD = images.map(lambda x: toNumpy(str(x[0]))) else: if args.format == "csv": images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) else: # args.format == "pickle": images = sc.pickleFile(args.images)