Example #1
from hdfs import Config
from hdfs.util import HdfsError


class HdfsClient:
    def __init__(self):
        # Client for the 'dev' alias defined in the local HdfsCLI config.
        self.client = Config().get_client('dev')

        # Create the 'datasets' directory if listing it fails because it
        # does not exist yet.
        try:
            self.client.list('datasets')
        except HdfsError:
            self.client.makedirs('datasets')
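
# An alternative sketch of the same check (an addition, not from the original
# example): the hdfs library's `status(..., strict=False)` returns None for a
# missing path, which avoids catching an exception (Example #7 below relies on
# the same behaviour).
from hdfs import Config

client = Config().get_client('dev')
if client.status('datasets', strict=False) is None:
    client.makedirs('datasets')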
Example #2
class PendingWindow(object):
    """docstring for PendingWindow"""
    def __init__(self, backup_dir, node):
        # TODO: not cut
        # each pending window (or node) only has a single downstream cut,
        # otherwise inconsistency occurs during truncating
        self.backup_dir = backup_dir
        self.node = node

        self.hdfs_client = Config().get_client('dev')

        self.hdfs_client.makedirs(self.backup_dir)

        # each backup file is named by the ending version, so the current writing one is named temporarily
        self.current_backup_path = os.path.join(self.backup_dir, 'current')
        # touch the file for later appending
        self.hdfs_client.write(self.current_backup_path, data='')

        # the version that last truncation conducted against
        self.safe_version_path = os.path.join(self.backup_dir, 'safe_version')
        # special case for initial version
        self.hdfs_client.write(self.safe_version_path, data=str(0))

        # the latest integral version
        self.latest_version_path = os.path.join(self.backup_dir,
                                                'latest_version')
        # special case for initial version
        self.hdfs_client.write(self.latest_version_path, data=str(0))

        if self.node.type != 'sink':
            self.version_acks = dict()
            for n in self.node.downstream_connectors:
                self.version_acks[n] = 0

    def append(self, tuple_):
        """Make an output tuple persistent, and complete a version if necessary
        """

        self.hdfs_client.write(self.current_backup_path,
                               data=pickle.dumps(tuple_),
                               append=True)

        if isinstance(tuple_, BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuple_.version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuple_.version),
                                   overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def extend(self, tuples):
        # TODO: can be improved
        with self.hdfs_client.write(self.current_backup_path,
                                    append=True) as f:
            for t in tuples:
                pickle.dump(t, f)

        if isinstance(tuples[-1], BarrierTuple):
            self.hdfs_client.rename(
                self.current_backup_path,
                os.path.join(self.backup_dir, str(tuples[-1].version)))
            self.hdfs_client.write(self.latest_version_path,
                                   data=str(tuples[-1].version),
                                   overwrite=True)
            self.hdfs_client.write(self.current_backup_path, data='')

    def truncate(self, version):
        """Delete files with filename <= version
        """
        # with self.hdfs_client.read(self.safe_version_path) as f:
        #     safe_version = int(f.read())
        #
        # # only = condition can occur
        # if version <= safe_version:
        #     return

        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) <= version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        # self.node.LOGGER.info('truncated version %d' % version)

    def handle_version_ack(self, version_ack):
        old_safe_version = min(self.version_acks.values())
        self.version_acks[version_ack.sent_from] = version_ack.version
        new_safe_version = min(self.version_acks.values())

        if new_safe_version > old_safe_version:
            self.hdfs_client.write(self.safe_version_path,
                                   data=str(new_safe_version),
                                   overwrite=True)
            self.truncate(new_safe_version)

    def get_latest_version(self):
        with self.hdfs_client.read(self.latest_version_path) as f:
            latest_version = int(f.read())
        return latest_version

    def rewind(self, version=None):
        """Delete files with filename > version (including current file)
        """

        if version is None:
            self.hdfs_client.write(self.current_backup_path,
                                   data='',
                                   overwrite=True)
            return

        # TODO: underflow
        # assert version == 0 or
        for f in self.hdfs_client.list(self.backup_dir):
            if f.isdigit() and int(f) > version:
                self.hdfs_client.delete(os.path.join(self.backup_dir, f))

        self.hdfs_client.write(self.current_backup_path,
                               data='',
                               overwrite=True)

        self.hdfs_client.write(self.latest_version_path,
                               data=str(version),
                               overwrite=True)

    def replay(self):
        """When both the node and pending window state are ready, replay the pending window before resuming
        """

        for v in sorted(
                int(f) for f in self.hdfs_client.list(self.backup_dir)
                if f.isdigit()):
            # filter out the faster nodes
            tuples = []
            with self.hdfs_client.read(os.path.join(self.backup_dir,
                                                    str(v))) as f:
                while True:
                    try:
                        t = pickle.load(f)
                        tuples.append(t)
                    except EOFError:
                        self.node.LOGGER.debug(
                            'reached EOF, send this version')
                        break
                    # Spout needs version too, so that data source can resend from a version
                    # except pickle.UnpickleableError:
                    #     self.node.LOGGER.debug('spout reached partial dump location, send this incomplete version')
                    #     break
                self.node.multicast(self.node.downstream_nodes, tuples)
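

# A hypothetical usage sketch (an addition, not part of the original class):
# `node` is a stand-in with only the attribute read for a sink node, and the
# HDFS backup directory is a placeholder. A plain tuple is persisted, but a
# version is only completed when a BarrierTuple is appended.
from types import SimpleNamespace

sink_node = SimpleNamespace(type='sink')
window = PendingWindow('/tmp/pending_backup', sink_node)
window.append(('some', 'output', 'tuple'))   # persisted into the 'current' file
assert window.get_latest_version() == 0      # no BarrierTuple seen yet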
Example #3
HDFS_BASE_URL = "hdfs://bdrenfdludcf01:9000"

if __name__ == "__main__":

    # Folder creation for placing all the spark data
    cmd_a = "mkdir -p " + "/tmp/SPARK_PROCESS/"
    os.system(cmd_a)

    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME).set("spark.local.dir",
                                                "/tmp/SPARK_PROCESS/")

    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    client = Config().get_client('bdrenhdfs')
    files = client.list(HDFS_RAWFILE_DIR)
    totalfilecount = len(files)

    if totalfilecount == 0:
        print("There is no files to be processed, application exiting...")
        sys.exit(0)

    filecount = 0

    for filename in files:
        print(filename)
        if filename.find("Covid_Analysis_DataSet.csv") >= 0:
            filecount = filecount + 1
            df_covid = sqlContext.read.format("csv").option(
                "delimiter",
                ":").option("header", 'true').load(HDFS_BASE_URL +
Example #4
top_all = None

def print_rdd(rdd):
    global top_all
    top_all = rdd.take(10)  # 3 in fact
#     for row in top_all:
#          print('{}\t{}'.format(*row))


# In[3]:


# Emulate real life, where the data arrives in portions at regular intervals
DATA_PATH = "/data/course4/uid_ua_100k_splitted_by_5k"
# build the batches from the files of the dataset
batches = [sc.textFile(os.path.join(*[nn_address, DATA_PATH, path]))
           for path in client.list(DATA_PATH)]
#batches = batches[:2]
BATCH_TIMEOUT = 1  # interval, in seconds, at which the batches are sent as RDDs
ssc = StreamingContext(sc, BATCH_TIMEOUT)
ssc.checkpoint("./checkpoints")


dstream = ssc.queueStream(rdds=batches)
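
# A hypothetical sketch (an addition; the real `update_count`, like
# `extract_segments` and `recognize_finish`, is defined elsewhere in the
# notebook) of the state-update function that `updateStateByKey` expects
# below: it receives the new values for a key in the current batch and the
# key's previous state, and returns the new state.
def update_count(new_values, state):
    # assuming the upstream flatMap emits (key, 1)-style pairs
    return (state or 0) + sum(new_values)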

result = (dstream
    .flatMap(extract_segments)
)
#result.foreachRDD(print_rdd)
result.foreachRDD(recognize_finish)
(result
    .updateStateByKey(update_count)
Example #5
from json import dump, load

from hdfs import Config

# Client for the configured alias, and some sample data to upload to HDFS.
client = Config().get_client('dev')

model = {
  'first_feature': 2.,
  'second_feature': 12.,
}

# First, we delete any existing `models/` folder on HDFS.
client.delete('models', recursive=True)

# We can now upload the data, first as CSV.
with client.write('models/1.csv', encoding='utf-8') as writer:
  for item in model.items():
    writer.write(u'%s,%s\n' % item)

# We can also serialize it to JSON and directly upload it.
with client.write('models/1.json', encoding='utf-8') as writer:
  dump(model, writer)

# We can check that the files exist and get their properties.
assert client.list('models') == ['1.csv', '1.json']
status = client.status('models/1.csv')
content = client.content('models/1.json')

# Later, we can download the files back. The `delimiter` option makes it
# convenient to read CSV files.
with client.read('models/1.csv', delimiter='\n', encoding='utf-8') as reader:
  items = (line.split(',') for line in reader if line)
  assert dict((name, float(value)) for name, value in items) == model

# Loading JSON directly from HDFS is even simpler.
with client.read('models/1.json', encoding='utf-8') as reader:
  assert load(reader) == model
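
# A short follow-up sketch (an addition to the original example), assuming the
# same `client` and a writable local working directory: whole files can also
# be copied between HDFS and the local filesystem instead of being streamed.
client.download('models/1.json', '/tmp/1.json', overwrite=True)
client.upload('models/1_copy.json', '/tmp/1.json')
assert '1_copy.json' in client.list('models')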
Example #6
class Pickler:
    def __init__(self, sc, spark_session, uri, port):

        self.sc = sc
        self.spark_session = spark_session
        self.df = []
        self.models = []
        self.graphs = []
        self.base_path = uri + ":" + port
        self.local_pickle_path = os.path.dirname(
            os.path.realpath(__file__)) + '/../pickles/'

        self.pickle_path = '/user/hadoop/pickles/'
        self.model_path = '/user/hadoop/pickles/models/'

        self.dataset_path = self.pickle_path + "dataset/"
        self.private_release_path = self.dataset_path + "private/"
        self.anon_release_path = self.dataset_path + "github/"
        self.prod_release_path = self.dataset_path + "prod/"

        self.df_path = self.pickle_path + 'df/'
        self.graph_path = self.local_pickle_path + 'graphs/'
        self.labelled_df_path = self.df_path + 'labelled/'
        self.hdfs_client = Config().get_client('dev')

        self.load_df()
        self.load_models()
        self.load_graphs()

    #TODO: Implement generic methods for read dataset / model ONLY
    def read(self):
        pass

    def save(self):
        pass

    def getLabelledFiles(self):
        return self.hdfs_client.list(self.prod_release_path)

    def readCSVToDF(self, date, folder):
        return self.spark_session.read.option(
            "header",
            True).csv(self.base_path + self.dataset_path + folder + "/" + date)

    def getLabelledTelemetry(self):
        return self.hdfs_client.list(self.private_release_path)

    def existsModel(self, name):
        res = self.hdfs_client.list(self.model_path)
        file_extension = '.model'
        return name + file_extension in res

    def getModel(self, name):
        return PipelineModel.load(self.base_path + self.model_path + name +
                                  ".model")

    def isDateLabelled(self, date):
        res = self.hdfs_client.list(self.prod_release_path)
        file_extension = ".csv"

        if date + file_extension in res:
            return True

        return False

    def load_graphs(self):
        for file in os.listdir(self.graph_path):

            if file.endswith(".pickle"):
                self.graphs.append(file[:-7])

    def existsGraph(self, date):
        if date in self.graphs:
            return True
        return False

    def getGraph(self, date):
        if date in self.graphs:
            with open(self.graph_path + date + ".pickle", 'rb') as pickle_file:
                content = pickle.load(pickle_file)

                return content

    def saveGraph(self, G, date):
        if date in self.graphs:
            return False

        nx.write_gpickle(G, self.graph_path + date + ".pickle")
        self.graphs.append(date)

    def existsDF(self, date, source):

        #2020.03.01_joy
        hash = self.getHash(date, source)
        if hash in self.df:
            return True

        return False

    def load_df(self):

        #Load Joy Data
        res = self.hdfs_client.list(self.df_path + 'joy')
        # print(f"Joy Items in directory: {res}")
        for file in res:

            if file.endswith(".parquet"):

                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        #Load graph features DF
        res = self.hdfs_client.list(self.df_path + 'graph')
        #  print(f"Graph DF Items in directory: {res}")
        for file in res:

            if file.endswith(".parquet"):

                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        res = self.hdfs_client.list(self.df_path + 'labelled')
        #   print(f"Labelled Items in directory: {res}")
        for file in res:

            if file.endswith(".parquet"):

                self.df.append(sha256(file[:-8].encode('utf-8')).hexdigest())

        # TODO : Load others?

    def saveModel(self, model, name):
        model.save(self.base_path + self.model_path + name + ".model")

    def load_models(self):
        res = self.hdfs_client.list(self.model_path)
        for file in res:
            if file.endswith(".model"):
                self.models.append(file.split('.')[0])

    def saveDFToCSV(self, df, date, folder, coalesced=False):

        if coalesced:
            df.coalesce(1).write.csv(self.base_path + self.pickle_path +
                                     "dataset/" + folder + '/' + date + '.csv',
                                     header=True)
        else:
            df.write.csv(self.base_path + self.pickle_path + "dataset/" +
                         folder + '/' + date + '.csv',
                         header=True)
            df.write.parquet(self.base_path + self.pickle_path + "dataset/" +
                             folder + '/' + date + '.parquet')

    def saveDF(self, df, date, source):
        hash = self.getHash(date, source)
        if hash in self.df:
            return False

        else:
            df.write.parquet(self.base_path + self.df_path + source + '/' +
                             date + "_" + source + '.parquet')
            self.df.append(hash)

    def getDF(self, date, source):

        hash = self.getHash(date, source)

        if hash in self.df:

            df = self.spark_session.read.parquet(self.base_path +
                                                 self.df_path + source + '/' +
                                                 date + "_" + source +
                                                 '.parquet')
            return df

        return False

    def getHash(self, date, source):

        id = date + "_" + source
        hash = sha256(id.encode('utf-8')).hexdigest()
        return hash
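

# A hypothetical usage sketch (an addition, not part of the original class),
# assuming a reachable Spark cluster plus the HDFS and local pickle layout
# hard-coded above; the namenode URI and port are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pickler-demo").getOrCreate()
pickler = Pickler(spark.sparkContext, spark, "hdfs://namenode", "9000")

date, source = "2020.03.01", "joy"
if not pickler.existsDF(date, source):
    df = pickler.readCSVToDF(date, "private")  # any DataFrame would do here
    pickler.saveDF(df, date, source)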
Example #7
class SparkHDFSClient(object):
    def __init__(self, datasource):
        self.datasource = datasource
        self.client = Config().get_client("dev")

    def get_file_list(self, folder):
        files = self.client.list(folder.strip())
        files = [folder + '/' + file for file in files]
        return files

    def list_collections(self):
        results = []
        status = self.client.status(self.datasource.url, strict=False)
        print(status, self.datasource.url)
        if status is not None:
            if status['type'] == "DIRECTORY":
                files = self.get_file_list(self.datasource.url)
                while len(files) > 0:
                    file = files.pop()
                    status = self.client.status(os.path.join(
                        self.datasource.url, file),
                                                strict=False)
                    if status is None:
                        continue
                    if status['type'] == "DIRECTORY":
                        subfiles = self.get_file_list(
                            os.path.join(self.datasource.url, file))
                        files.extend(subfiles)
                        continue
                    else:
                        # skip files whose extension does not match the source type
                        dstype = self.datasource.dstype
                        if (dstype == DataSourceType.SPARK_CSV and not file.endswith('sv')) \
                                or (dstype == DataSourceType.SPARK_TSV and not file.endswith('sv')) \
                                or (dstype == DataSourceType.SPARK_XML and not file.endswith('xml')) \
                                or (dstype == DataSourceType.SPARK_JSON and not file.endswith('json')):
                            continue
                        row = {
                            "db": file[:file.rfind('/')] if '/' in file else self.datasource.url,
                            "document": file[file.rfind('/') + 1:] if '/' in file else file,
                            "count": -1
                        }
                        results.append(row)

                return results
            else:
                return [{
                    "db": self.datasource.url,
                    "document": self.datasource.url,
                    "count": -1
                }]
        else:
            return results

    def get_documents(self, filename, limit=10):
        results = []
        delimiter = "\n"
        header = None
        rows = 0
        if self.datasource.dstype == DataSourceType.SPARK_CSV or \
                self.datasource.dstype == DataSourceType.SPARK_TSV:
            delimiter = "\n"
            with self.client.read(filename,
                                  encoding='utf-8',
                                  delimiter=delimiter) as reader:
                for line in reader:
                    if len(line.strip()) == 0 or line[0] == '#':
                        continue
                    if filename[-3:] == "csv":
                        line = line.split(',')
                    else:
                        line = line.split('\t')

                    if header is None:
                        header = line
                        continue
                    res = {
                        header[i]: line[i]
                        for i in range(len(line)) if i < len(header)
                    }
                    results.append(res)
                    rows += 1
                    if rows > limit + 1:
                        break
        elif self.datasource.dstype == DataSourceType.SPARK_XML:
            with self.client.read(filename, encoding='utf-8',
                                  chunk_size=2048) as reader:
                header = ['content']
                for chunk in reader:
                    res = {'content': str(chunk)}
                    results.append(res)
                    print(results)
                    break
        elif self.datasource.dstype == DataSourceType.SPARK_JSON:
            with self.client.read(filename, encoding='utf-8') as reader:
                model = load(reader)
                if isinstance(model, list):
                    # Flatten nested structures to short string summaries so
                    # the preview rows stay readable.
                    model = [{
                        p: str(list(md[p][0].keys()))
                        if isinstance(md[p], list) and isinstance(md[p][0], dict)
                        else str(md[p]) if isinstance(md[p], list)
                        else str(list(md[p].keys())) if isinstance(md[p], dict)
                        else md[p]
                        for p in md
                    } for md in model]
                    results.extend(model)
                else:
                    model = {
                        p: str(list(model[p][0].keys()))
                        if isinstance(model[p], list) and isinstance(model[p][0], dict)
                        else model[p] if isinstance(model[p], list)
                        else str(list(model[p].keys())) if isinstance(model[p], dict)
                        else model[p]
                        for p in model
                    }
                    results.append(model)

        return results[:limit], limit
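

# A hypothetical usage sketch (an addition, not part of the original class):
# `datasource` is a stand-in carrying the two attributes the class reads,
# `url` and `dstype`; the HDFS paths are placeholders.
from types import SimpleNamespace

datasource = SimpleNamespace(url='/data/raw_csv', dstype=DataSourceType.SPARK_CSV)
reader = SparkHDFSClient(datasource)

collections = reader.list_collections()
docs, limit = reader.get_documents('/data/raw_csv/sample.csv', limit=5)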
Example #8
###############
### Setting up File Paths and Lists
###############

client = Config().get_client('dev')

workingFolder_Indian = "SgIndian_vcf/dataFreeze_Feb2013/SNP/biAllele/"

workingFolder_Malay = "SgMalay_vcf/2012_05/snps/"

workingFolder_Chinese = "1000G_CDX/Phase3/integrated/"

# Finding the number of unique samples found in the working folders...

freqFiles_Indian = [
    f for f in client.list(workingFolder_Indian)
    if re.match(r'chr\d+_analysis_exome\.frq', f)
]
rsIDFiles_Indian = [
    f for f in client.list(workingFolder_Indian)
    if re.match(r'chr\d+_rsID', f)
]
freqFiles_Malay = [
    f for f in client.list(workingFolder_Malay)
    if re.match(r'chr\d+_analysis_exome\.frq', f)
]
rsIDFiles_Malay = [
    f for f in client.list(workingFolder_Malay) if re.match(r'chr\d+_rsID', f)
]
freqFiles_Chinese = [
    f for f in client.list(workingFolder_Chinese)