Example No. 1
    def create_VP_tables(self):
        print "Beginning the creation of VP tables."
        total_properties = len(self.properties)
        i = 0
        # for each distinct property, create a table
        for p in self.properties:
            i += 1
            prop_df = self.sqlContext.sql(
                "SELECT s AS s, o AS o FROM tripletable WHERE p='" + p + "'")
            df_writer = DataFrameWriter(prop_df)
            df_writer.saveAsTable("VP_" + valid_string(p))
            sys.stdout.write("\rTables created: %d / %d " %
                             (i, total_properties))

        # if statistics are enabled, compute them
        if self.statsEnabled:
            i = 0
            stat = Stats()
            for p in self.properties:
                i += 1
                tableDF = self.sqlContext.sql("SELECT * FROM VP_" +
                                              valid_string(p))
                stat.addTableStat(p, tableDF)
                sys.stdout.write("\rStatistics created: %d / %d " %
                                 (i, total_properties))
            with open(self.statsFile, "w") as f:
                f.write(stat.getSerializedStats())
        print "Statistics created: %d / %d " % (i, total_properties)
Example No. 2
def download_file_from_one_drive(URL, destination):
    import requests
    session = requests.Session()
    response = session.get(URL, stream=True)
    token = None
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            token = value
    if token:
        params = {'confirm': token}
        response = session.get(URL, params=params, stream=True)
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
Example No. 3
def download_file_from_google_drive(id, destination):
    import requests
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = None
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            token = value
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
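A minimal usage sketch for the function above; the file id and destination filename below are placeholders, not real values:

# Hypothetical call: the id must be the id of a shared Google Drive file.
download_file_from_google_drive("YOUR_FILE_ID", "pretrained_weights.bin")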
def get_game_list(user_id):
    import requests
    import time
    import json
    base_url = 'http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/'
    keys = [key1, key2]
    curr_keyid = 0
    params = {
            'key' : keys[curr_keyid],
            'steamid' : user_id.strip(),
            'format' : 'json' }
    
    games = None
    for i in range(3):
        try:
            games = requests.get(base_url, params=params).json().get('response').get('games')
            break
        except Exception:
            try:
                # Retry once with the alternate API key before backing off.
                curr_keyid = 1 - curr_keyid
                params.update({'key': keys[curr_keyid]})
                games = requests.get(base_url, params=params).json().get('response').get('games')
                break
            except Exception:
                time.sleep(5)
    
    if games and len(games) > 0:
        gamelist = []
        with open('userinfo', 'a') as f:
            for g in games:
                gamelist.append(g)
                g.update({'userid': int(user_id.strip())})
                f.write(json.dumps(g))
                f.write('\n')
                userid = int(user_id.strip())
                appid = g.get('appid')
                playtime_forever = g.get('playtime_forever')
                spark.sql("INSERT INTO userinfo VALUES (%s, %s, %s)" % (userid, appid, playtime_forever))
        return gamelist
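A hedged driver sketch for get_game_list, assuming key1, key2 and the SparkSession spark are defined at module level and that Steam user ids are stored one per line in a hypothetical userids.txt:

# Hypothetical driver loop over a file of Steam user ids.
with open("userids.txt") as ids_file:
    for uid in ids_file:
        games = get_game_list(uid)
        print("user %s: %d games" % (uid.strip(), len(games) if games else 0))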
)

display(dfOutput)

# COMMAND ----------

import pyspark.sql.functions as f

# Concat all text for the same file with a space " "
dfOutputPerFile = dfOutput \
  .groupby(dfOutput.filename) \
  .agg(f.concat_ws(" ", f.collect_list(dfOutput.text)) \
       .alias("text"))

display(dfOutputPerFile)

# COMMAND ----------

import pathlib

outputDir = f"/dbfs/mnt/{dbfs_mount_name}/output"
pathlib.Path(outputDir).mkdir(parents = True, exist_ok = True)

for row in dfOutputPerFile.collect():
  with open(f"{outputDir}/{row.filename}", "w") as f:
    f.write(row.text)

# COMMAND ----------
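# Optional sanity check, assuming this runs in a Databricks notebook where
# `dbutils` and `display` are available: list the files written above.
written_files = dbutils.fs.ls(f"dbfs:/mnt/{dbfs_mount_name}/output")
display(written_files)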


Example No. 6
def training(sparkSession, arguments, logger):

    # Get the input file path
    inputPath = arguments['--input']
    logger.info("...Starting training...")
    logger.info("Loading data from: {0}".format(inputPath))

    # Read the input dataset
    trainDF = sparkSession.read.parquet(inputPath)
    # Preprocess the data
    processData = preprocess(trainDF, logger)

    # Select final columns for training the algorithms
    processData = processData.select(processData.tdur.cast(FloatType()),
                                     processData.sport.cast(IntegerType()),
                                     processData.dport.cast(IntegerType()),
                                     processData.flag_onehot0,
                                     processData.flag_onehot1,
                                     processData.flag_onehot2,
                                     processData.flag_onehot3,
                                     processData.flag_onehot4,
                                     processData.flag_onehot5,
                                     processData.proto_onehot0,
                                     processData.proto_onehot1,
                                     processData.proto_onehot2,
                                     processData.proto_onehot3,
                                     processData.proto_onehot4,
                                     processData.ipkt.cast(FloatType()),
                                     processData.ibyt.cast(FloatType()),
                                     processData.opkt.cast(FloatType()),
                                     processData.obyt.cast(FloatType()),
                                     processData.nconnections.cast(IntegerType()))
    ## Normalize the data
    # Initialize a dictionary for the minimum and the maximum values for each normalized feature
    min_max = {}
    min_max['inputFeatures'] = []

    [dataNor, min_max] = NormalizeValues(processData, arguments, min_max, logger)

    # Save the minimum-maximum json file in the local filesystem
    with open(minMaxFile,"w") as f:
        f.write(json.dumps(min_max))

    # Transform Spark Data Frame into Pandas Data Frame
    dataNor = dataNor.toPandas()
    # Get the values to train the network
    dataTrain = dataNor.values

    # Select the training algorithm
    if arguments['OneClassSVM']:
        nu = float(arguments['<nu>'])
        kernel = arguments['<kernel>']
        alg = svm.OneClassSVM(kernel=kernel, nu=nu)
        algorithm = "OneClassSVM"
        logger.info("One Class SVM model")
    elif arguments['IsolationForest']:
        estimators = int(arguments['<estimator>'])
        contamination = float(arguments['<contam>'])
        logger.info("Estimators: {0}; Contamination: {1}".format(estimators, contamination))
        alg = IsolationForest(n_estimators=estimators, contamination=contamination)
        algorithm = "IsolationForest"
        logger.info("Isolation Forest model")
    elif arguments['LocalOutlier']:
        neighbors = int(arguments['<neigh>'])
        contamination = float(arguments['<contam>'])
        alg = LocalOutlierFactor(n_neighbors=neighbors, contamination=contamination)
        algorithm = "LocalOutlier"
        logger.info("Local Outlier Factor model")

    logger.info("Fitting the network")
    # Training the network with the data
    alg.fit(dataTrain)
    logger.info("Algorithm has been trained.")

    algFile = algorithm + "_network.plk"
    # Copy the trained network to the network_trained parameter
    joblib.dump(alg, algFile)

    # Get the path to save the results
    hdfsTrainDir = arguments['--output']
    logger.info("Saving results to {0}".format(hdfsTrainDir))

    # Get HDFS structures
    path = sparkSession.sparkContext._gateway.jvm.org.apache.hadoop.fs.Path
    fileSystem = sparkSession.sparkContext._gateway.jvm.org.apache.hadoop.fs.FileSystem
    hadoopConfiguration = sparkSession.sparkContext._gateway.jvm.org.apache.hadoop.conf.Configuration
    fs = fileSystem.get(hadoopConfiguration())

    hdfsAlgPath = hdfsTrainDir + "/algorithms/" + algorithm + "_network.plk"
    hdfsMinMaxPath = hdfsTrainDir + "/" +  minMaxFile
    if fs.exists(path(hdfsAlgPath)):
        logger.warn("A trained network for the {0} algorithm already exists in {1}".format(algorithm, hdfsTrainDir))
        logger.warn("The file is going to be overwritten.")
        try:
            fs.delete(path(hdfsAlgPath), False)
        except Exception:
            logger.error("Couldn't delete the network file")

    if fs.exists(path(hdfsMinMaxPath)):
        logger.warn("A file with the minimum and maximum values already exists in {0}".format(hdfsTrainDir))
        logger.warn("The file is going to be overwritten.")
        try:
            fs.delete(path(hdfsMinMaxPath), False)
        except Exception:
            logger.error("Couldn't delete the minimum and maximum file")

    try:
        srcAlgFile = path(algFile)
        dstAlgFile = path(hdfsAlgPath)
        fs.moveFromLocalFile(srcAlgFile, dstAlgFile)
        logger.info("Training model exported correctly.")
    except Exception:
        logger.error("Couldn't save the network in the file system.")

    try:
        srcMinMaxFile = path(minMaxFile)
        dstMinMaxFile = path(hdfsMinMaxPath)
        fs.moveFromLocalFile(srcMinMaxFile, dstMinMaxFile)
        logger.info("Minimum and maximum features file exported correctly.")
    except Exception:
        logger.error("Couldn't save the minimum and maximum file in the file system.")

    logger.info("..Training has finished..")
Example No. 7
    g = GraphFrame(vertices_df, edges_df)
    result = g.labelPropagation(maxIter=5)

    # Get result and sort
    result_rdd = result.select("id", "label").rdd
    ans = result_rdd \
        .map(lambda line: (line.label, line.id)) \
        .groupByKey() \
        .map(lambda line: (len(line[1]), [str(i) for i in line[1]]))\
        .groupByKey()\
        .flatMap(lambda line: sorted([sorted(i) for i in line[1]], key=lambda x: x[0]))\
        .collect()

    # Output as txt file
    with open(community_output_file_path, 'w') as f:
        for line in ans:
            for i in range(len(line)):
                user = line[i]
                if i != len(line) - 1:
                    f.write('\'' + str(user) + '\', ')
                else:
                    f.write('\'' + str(user) + '\'')
            f.write('\n')

    # Finish time
    timer(start)

# spark-submit --packages graphframes:graphframes:0.6.0-spark2.3-s_2.11 task1.py 7 ub_sample_data.csv task1_ans
# spark-submit --packages graphframes:graphframes:0.6.0-spark2.3-s_2.11 task1.py 7 $ASNLIB/publicdata/ub_sample_data.csv task1_ans
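In this fragment vertices_df and edges_df are built earlier in the script and are not shown; a hedged sketch of their shape, assuming `pairs` is a Python list of (user_a, user_b) tuples that passed the common-business threshold and `spark` is an existing SparkSession:

# Hypothetical construction of the GraphFrame inputs.
undirected = pairs + [(b, a) for a, b in pairs]  # add both directions so neighbours see each other
edges_df = spark.createDataFrame(undirected, ["src", "dst"])
users = sorted({u for edge in pairs for u in edge})
vertices_df = spark.createDataFrame([(u,) for u in users], ["id"])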
userdf = spark.createDataFrame(nodes, StringType()).selectExpr("value as id")

#print(filterPhaseA.count())
#print(nodes.count())

g = GraphFrame(userdf, edgedf)
result = g.labelPropagation(maxIter=5)
community = result.groupby("label").agg(f.collect_list("id").alias("id"))
fin = community.select("id").rdd.flatMap(lambda x: x).collect()

li = []
for i in fin:
    li.append(sorted(i))
#print(li)
reslen = sorted(list(set(len(x) for x in li)))
with open(sys.argv[3], "w") as out_file:
    for i in reslen:
        can = []
        for j in li:
            if len(j) == i:
                can.append(j)
        can = sorted(can)
        for m in can:
            out_file.write(str(m).replace("[", "").replace("]", ""))
            out_file.write("\n")

end = time.time()

print("Duration", end - start)
# Lat and Lon for Las Vegas city
lat = 36.127430
lon = -115.138460
# Limit the Lat and Lon for easy visualization of results
lon_min, lon_max = lon - 0.3, lon + 0.5
lat_min, lat_max = lat - 0.4, lat + 0.5
# Logic to filter only Las Vegas records
yelp_LV = yelp_b.select(
    "city", yelp_b.latitude.cast("double"), yelp_b.longitude.cast("double"),
    yelp_b.stars.cast("double")).filter(yelp_b.city == "Las Vegas")
# Logic to select only records which are within the configured Lat and Lon Limit
yelp_LV_PlotInfo = yelp_LV.withColumn(
    'plotOnMap', (yelp_LV.latitude > lat_min) & (yelp_LV.latitude < lat_max) &
    (yelp_LV.longitude > lon_min) & (yelp_LV.longitude < lon_max))
yelp_LV_PlotInfo_True = yelp_LV_PlotInfo.filter(yelp_LV_PlotInfo.plotOnMap)
# Logic to generate output as per Folium Mapping input Format
distinct_stars = yelp_b.select(yelp_b.stars).distinct()
distinct_stars_list = [row.stars for row in distinct_stars.collect()]
data = []
for star in distinct_stars_list:
    subset = yelp_LV_PlotInfo_True.filter(yelp_LV_PlotInfo_True.stars == star)
    data.append(
        [[x.latitude, x.longitude]
         for x in subset.select(subset.latitude, subset.longitude).collect()])
# Write the data to output File.
with open("VegasHeatMapData.txt", "w") as f:
    f.write(str(data))
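The file written above can be fed straight into a Folium heat map; a minimal sketch, assuming Folium is installed and the output file from the previous step is in the working directory (the zoom level and output HTML name are arbitrary choices):

import ast

import folium
from folium.plugins import HeatMap

# Read the serialized list of [lat, lon] groups back and flatten it.
with open("VegasHeatMapData.txt") as f:
    data = ast.literal_eval(f.read())
points = [p for group in data for p in group]

# Center the map on Las Vegas and overlay the heat map layer.
vegas_map = folium.Map(location=[36.127430, -115.138460], zoom_start=11)
HeatMap(points).add_to(vegas_map)
vegas_map.save("VegasHeatMap.html")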