import argparse
import sys

# read_hdfs_csv is assumed to live in util alongside the other helpers used below.
from util import (init_spark, init_spark_tagger, init_spark_tokenizer,
                  init_spark_sentencizer, process_wiki_json, read_hdfs_csv)

# Whitespace fallback; replaced by the Spark tokenizer initialized below.
tokenize = lambda x: x.split()

COUNT_THRESHOLD = 2
NPMI_THRESHOLD = 0.75

parser = argparse.ArgumentParser(description='Filter corpus n-grams by count and NPMI thresholds.')
parser.add_argument('vocab', type=str, help='Path to vocabulary')
parser.add_argument('corpus', type=str, help='Path to corpus')
parser.add_argument('output', type=str, help='Path to output')
parser.add_argument('ngram', type=int, help='Length of ngram')
parser.add_argument('subsample', type=float, help='Subsample amount')

args = parser.parse_args()
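# Hypothetical invocation (script name, paths, and values are illustrative only):
#   spark-submit extract_ngrams.py vocab.txt corpus/ out/ 3 0.9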

spark, sc = init_spark()
tokenize = init_spark_tokenizer(sc)
tag_rus = init_spark_tagger(sc, 'rus')
sent_tokenize = init_spark_sentencizer(sc, 'rus')

vocabulary_location = args.vocab
corpus_location = args.corpus
ngram_len = args.ngram
subsample_factor = 1. - args.subsample

corpus = sc.textFile(corpus_location)
vocabulary_file = sc.textFile(vocabulary_location)
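# subsample_factor is the fraction of the corpus to keep (args.subsample is the
# fraction dropped); it is presumably consumed via RDD.sample. A sketch of that
# step, assumed rather than shown in this excerpt:
#   corpus = corpus.sample(False, subsample_factor, seed=42)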


def valid_n_gram(ngram_count_npmi):
    # Assumed reconstruction: keep an n-gram only if it clears both thresholds.
    _, count, npmi = ngram_count_npmi
    return count >= COUNT_THRESHOLD and npmi >= NPMI_THRESHOLD


# Example #2

def total_score(x):
    # Weight (reading, math, writing) averages by test takers so reduceByKey
    # sums student-weighted totals; the trailing 1 counts schools per zip.
    n_students = x[0]
    return (n_students, x[1] * n_students, x[2] * n_students, x[3] * n_students, 1)

def avg(x):
    # x = (total_students, reading_total, math_total, writing_total);
    # divide back out to get per-student section averages for a zip.
    n_students = x[0]

    if n_students == 0:
        return (0, 0, 0)

    return (x[1] / n_students, x[2] / n_students, x[3] / n_students)
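
# Worked example (made-up numbers), two schools in one zip:
#   total_score((10, 400, 500, 450)) -> (10, 4000, 5000, 4500, 1)
#   total_score((30, 500, 400, 350)) -> (30, 15000, 12000, 10500, 1)
# elementwise sum: (40, 19000, 17000, 15000, 2)
# avg((40, 19000, 17000, 15000, 2)) -> (475.0, 425.0, 375.0)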

if __name__ == '__main__':
    sc, sqlContext = init_spark(verbose_logging=True)

    sat = read_hdfs_csv(sqlContext, '/user/zz1409/sat_scores2012.csv')
    loc = read_hdfs_csv(sqlContext, '/user/zz1409/loc_clean.csv')

    # Key SAT rows by lowercased school name; values are
    # (num_test_takers, reading, math, writing).
    sat = sat.map(lambda x: (x[1].lower(), (x[2], x[3], x[4], x[5])))

    loc = loc.select('school_name', 'zip').map(lambda x: (x[0], x[1]))

    # Join on school name, re-key by zip, and sum student-weighted totals.
    df = sat.join(loc).map(lambda x: (x[1][1], total_score(x[1][0])))
    df = df.reduceByKey(lambda x, y: [a + b for a, b in zip(x, y)])
    df = df.map(lambda x: (x[0], avg(x[1])))
    df = df.map(lambda x: (x[0], x[1][0], x[1][1], x[1][2]))

    df = df.toDF(['zip', 'reading','math','writing'])
    df.toPandas().to_csv('mycsv.csv')
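    # mycsv.csv ends up with one row per zip: student-weighted
    # reading/math/writing averages.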

# Example #3

def parse_ints(s):
    # Parse a comma-separated field spec, expanding dash ranges.
    r = []
    for x in s.split(','):
        if '-' in x:
            a, b = x.split('-')
            r.extend(range(int(a), int(b) + 1))
        else:
            r.append(int(x))
    return r
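# e.g. parse_ints('1,3-5,7') -> [1, 3, 4, 5, 7]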


if __name__ == '__main__':
    sc, sqlContext = init_spark()
    sc.addPyFile('util.py')

    rows = read_hdfs_csv(sqlContext, dbname)
    rows_infer = read_hdfs_csv(sqlContext, dbname, inferschema=True)
    id_ = rows.select('CMPLNT_NUM')

    fields = parse_ints(sys.argv[1])
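    # e.g. passing '1,3-5' on the command line runs the handlers
    # for fields 1, 3, 4, and 5.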

    # CMPLNT_NUM
    if 1 in fields:
        df = handle_cmplnt_num(rows, rows_infer)
        summarize(df, 'CMPLNT_NUM')
        dump_info(df, 'CMPLNT_NUM')
        # Keep only complaint IDs whose CMPLNT_NUM was marked VALID.
        id_ = id_.intersect(
            df.where(df.CMPLNT_NUM_valid == 'VALID').select('CMPLNT_NUM'))