"""Driver setup: build an n-gram/NPMI phrase vocabulary from a corpus on Spark.

NOTE(review): this file arrived with its whitespace mangled (several scripts
collapsed onto single lines); the body of ``valid_n_gram`` was lost in the
process and is reconstructed below as a hedged guess.
"""
import argparse  # NOTE(review): used below but missing from the original imports

from util import (init_spark, init_spark_tagger, init_spark_tokenizer,
                  init_spark_sentencizer, process_wiki_json)

# Minimum raw occurrence count and NPMI score for an n-gram to be kept.
COUNT_THRESHOLD = 2
NPMI_THRESHOLD = 0.75

# NOTE(review): the description string is a leftover from the argparse docs
# example; kept byte-for-byte to preserve observable --help output.
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('vocab', type=str, help='Path to vocabulary')
parser.add_argument('corpus', type=str, help='Path to corpus')
parser.add_argument('output', type=str, help='Path to output')
parser.add_argument('ngram', type=int, help='Length of ngram')
parser.add_argument('subsample', type=float, help='Subsample amount')
args = parser.parse_args()

spark, sc = init_spark()
# The Spark tokenizer supersedes the original placeholder
# ``tokenize = lambda x: x.split()``, which was dead code: it was rebound
# here before any possible use, so the placeholder is dropped.
tokenize = init_spark_tokenizer(sc)
tag_rus = init_spark_tagger(sc, 'rus')          # Russian POS tagger
sent_tokenize = init_spark_sentencizer(sc, 'rus')

vocabulary_location = args.vocab
corpus_location = args.corpus
ngram_len = args.ngram
# presumably the fraction of data to KEEP after subsampling -- TODO confirm
subsample_factor = 1. - args.subsample

corpus = sc.textFile(corpus_location)
vocabulary_file = sc.textFile(vocabulary_location)


def valid_n_gram(ngram_count_npmi):
    """Return True when an (ngram, (count, npmi)) record passes both
    module-level thresholds.

    NOTE(review): the original body was lost when this file's whitespace was
    mangled; this reconstruction is inferred from the thresholds above.
    Recover the real body from version control before relying on it.
    """
    _, (count, npmi) = ngram_count_npmi
    return count >= COUNT_THRESHOLD and npmi >= NPMI_THRESHOLD
def total_score(x):
    """Expand one school's record into summable totals.

    NOTE(review): the original ``def`` line was lost when this file's
    whitespace was mangled; the function name is reconstructed from the call
    site in ``__main__`` below.

    ``x`` is (n_students, reading_avg, math_avg, writing_avg).  Returns
    (n_students, reading_total, math_total, writing_total, 1) so that
    element-wise sums over schools can later be re-averaged by ``avg``.
    """
    n_students = x[0]
    # NOTE(review): if the CSV columns arrive as strings these
    # multiplications are wrong (or raise) -- confirm that read_hdfs_csv
    # yields numeric values for the score columns.
    return (n_students,
            x[1] * n_students,
            x[2] * n_students,
            x[3] * n_students,
            1)


def avg(x):
    """Turn summed per-zip totals back into per-student averages.

    ``x`` is the element-wise sum produced by reducing ``total_score``
    tuples; returns (reading_avg, math_avg, writing_avg), or (0, 0, 0) when
    no students were counted (avoids ZeroDivisionError).
    """
    n_students = x[0]
    if n_students == 0:
        return (0, 0, 0)
    return (x[1] / n_students, x[2] / n_students, x[3] / n_students)


if __name__ == '__main__':
    sc, sqlContext = init_spark(verbose_logging=True)
    sat = read_hdfs_csv(sqlContext, '/user/zz1409/sat_scores2012.csv')
    loc = read_hdfs_csv(sqlContext, '/user/zz1409/loc_clean.csv')
    # key SAT rows by lower-cased school name -> (n_takers, r, m, w)
    sat = sat.map(lambda x: (x[1].lower(), (x[2], x[3], x[4], x[5])))
    loc = loc.select('school_name', 'zip').map(lambda x: (x[0], x[1]))
    df = sat.join(loc).map(lambda x: (x[1][1], total_score(x[1][0])))
    # element-wise sum of the 5-tuples per zip code
    # (original shadowed the outer lambda args inside the comprehension)
    df = df.reduceByKey(lambda a, b: [u + v for u, v in zip(a, b)])
    df = df.map(lambda x: (x[0], avg(x[1])))
    df = df.map(lambda x: (x[0], x[1][0], x[1][1], x[1][2]))
    df = df.toDF(['zip', 'reading', 'math', 'writing'])
    df.toPandas().to_csv('mycsv.csv')
import sys  # NOTE(review): used below via sys.argv but never imported in the original


def parse_ints(s):
    """Parse a comma-separated list of ints and inclusive int ranges.

    '1,3-5,7' -> [1, 3, 4, 5, 7].  Raises ValueError on non-numeric parts.
    """
    result = []
    for part in s.split(','):
        if '-' in part:
            lo, hi = part.split('-')
            result.extend(range(int(lo), int(hi) + 1))
        else:
            result.append(int(part))
    return result


if __name__ == '__main__':
    sc, sqlContext = init_spark()
    sc.addPyFile('util.py')
    # NOTE(review): dbname, read_hdfs_csv, handle_cmplnt_num, summarize and
    # dump_info are not defined in the visible portion of this file --
    # presumably imported from util or defined in lines lost to the
    # paste-mangling; verify before running.
    rows = read_hdfs_csv(sqlContext, dbname)
    rows_infer = read_hdfs_csv(sqlContext, dbname, inferschema=True)
    id_ = rows.select('CMPLNT_NUM')
    fields = parse_ints(sys.argv[1])

    # CMPLNT_NUM
    if 1 in fields:
        df = handle_cmplnt_num(rows, rows_infer)
        summarize(df, 'CMPLNT_NUM')
        dump_info(df, 'CMPLNT_NUM')
        # keep only complaint ids whose CMPLNT_NUM field validated
        id_ = id_.intersect(
            df.where(df.CMPLNT_NUM_valid == 'VALID').select('CMPLNT_NUM'))