예제 #1
0
    def create_corpus(self, table_name):
        """Recreate ``table_name`` and load one record per corpus line.

        The table is dropped if it exists and created again as
        ``(id string, content string)`` with ``lifecycle 3``.  Every line of the
        ``splited_words.txt`` resource becomes one record: ``id`` is the 1-based
        line number and ``content`` is the line with all ``####`` markers removed.

        :param table_name: name of the table to drop, create and populate
        """
        self.odps.execute_sql("drop table if exists " + table_name)
        self.odps.execute_sql("create table %s (id string, content string) lifecycle 3" % table_name)

        upload_ss = self.tunnel.create_upload_session(table_name)
        writer = upload_ss.open_record_writer(0)

        lines = load_resource_string('odps.examples.data', 'splited_words.txt').splitlines()
        for line_no, line in enumerate(lines):
            rec = upload_ss.new_record()
            # ``id`` is declared as a string column, so the line number is written
            # as text (the same way create_word_triple fills its ``id`` column).
            cols = [str(line_no + 1), line.replace('####', '')]
            for i, val in enumerate(cols):
                rec.set(i, val)
            writer.write(rec)
        writer.close()
        # Commit the single writer that was opened with id 0.
        upload_ss.commit([0, ])
예제 #2
0
    def create_iris_kv(self, table_name):
        """Recreate ``table_name`` with the iris data in ``index:value`` form.

        The table is created as ``(content string, category bigint)``.  For each
        line of the ``iris.txt`` resource, all comma-separated fields except the
        last are joined into ``content`` as ``position:value`` pairs, and
        ``category`` is 0 when the last field contains ``setosa``, otherwise 1.

        :param table_name: name of the table to drop, create and populate
        """
        self.odps.execute_sql("drop table if exists " + table_name)
        self.odps.execute_sql('create table %s (content string, category bigint) lifecycle 3' % table_name)

        session = self.tunnel.create_upload_session(table_name)
        record_writer = session.open_record_writer(0)

        raw_text = load_resource_string('odps.examples.data', 'iris.txt')
        for row in raw_text.splitlines():
            record = session.new_record()
            parts = row.split(',')
            features, label = parts[:-1], parts[-1]

            kv_pairs = ['%s:%s' % (pos, value) for pos, value in enumerate(features)]
            record.set(0, ','.join(kv_pairs))

            # Binary label: setosa rows are class 0, everything else is class 1.
            if 'setosa' in label:
                category = 0
            else:
                category = 1
            record.set(1, category)

            record_writer.write(record)
        record_writer.close()
        session.commit([0, ])
예제 #3
0
    def create_ionosphere(self, table_name):
        """Recreate ``table_name`` and load the ionosphere data set into it.

        The table has the double columns ``a01`` .. ``a34`` followed by a
        ``class bigint`` column.  Each line of the ``ionosphere.txt`` resource is
        split on commas; a field is parsed as ``float`` when the record column at
        the same position reports the type ``'double'`` and as ``int`` otherwise.

        :param table_name: name of the table to drop, create and populate
        """
        column_defs = ['a%02d double' % idx for idx in range(1, 35)]
        fields = ','.join(column_defs) + ', class bigint'
        self.odps.execute_sql("drop table if exists " + table_name)
        self.odps.execute_sql("create table %s (%s) lifecycle 3" % (table_name, fields))

        session = self.tunnel.create_upload_session(table_name)
        record_writer = session.open_record_writer(0)

        raw_text = load_resource_string('odps.examples.data', 'ionosphere.txt')
        for row in raw_text.splitlines():
            record = session.new_record()

            # Convert the whole row first, then fill the record.
            values = []
            for pos, text in enumerate(row.split(',')):
                if record._columns[pos].type == 'double':
                    values.append(float(text))
                else:
                    values.append(int(text))

            for pos, value in enumerate(values):
                record.set(pos, value)
            record_writer.write(record)
        record_writer.close()
        session.commit([0, ])
예제 #4
0
    def create_iris(self, table_name):
        """Recreate ``table_name`` and load the iris data set into it.

        The table has four double columns (sepal/petal length and width) and a
        string ``category`` column.  For each line of the ``iris.txt`` resource,
        every comma-separated field except the last is parsed as ``float`` and
        the field at index 4 is stored unchanged as the category.

        :param table_name: name of the table to drop, create and populate
        """
        self.odps.execute_sql("drop table if exists " + table_name)
        create_sql = ('create table %s (sepal_length double, sepal_width double, petal_length double, '
                      + 'petal_width double, category string) lifecycle 3') % table_name
        self.odps.execute_sql(create_sql)

        session = self.tunnel.create_upload_session(table_name)
        record_writer = session.open_record_writer(0)

        raw_text = load_resource_string('odps.examples.data', 'iris.txt')
        for row in raw_text.splitlines():
            record = session.new_record()
            parts = row.split(',')

            values = list(map(float, parts[:-1]))
            values.append(parts[4])

            for pos, value in enumerate(values):
                record.set(pos, value)
            record_writer.write(record)
        record_writer.close()
        session.commit([0, ])
예제 #5
0
    def create_splited_words(self, table_name):
        """Recreate ``table_name`` and load one record per word of the corpus.

        The table is dropped if it exists and created again as
        ``(id string, content string)`` with ``lifecycle 3``.  Each line of the
        ``splited_words.txt`` resource is split on ``####`` and every piece is
        written as its own record, with ``id`` set to the 1-based line number.
        Reading stops at the first line that is empty after stripping whitespace.

        :param table_name: name of the table to drop, create and populate
        """
        self.odps.execute_sql("drop table if exists " + table_name)
        self.odps.execute_sql("create table %s (id string, content string) lifecycle 3" % table_name)

        upload_ss = self.tunnel.create_upload_session(table_name)
        writer = upload_ss.open_record_writer(0)

        lines = load_resource_string('odps.examples.data', 'splited_words.txt').splitlines()
        for line_no, line in enumerate(lines):
            if not line.strip():
                # The first blank line ends the upload; later lines are ignored.
                break
            # ``id`` is declared as a string column, so the line number is written
            # as text (the same way create_word_triple fills its ``id`` column).
            doc_id = str(line_no + 1)
            for word in line.split('####'):
                rec = upload_ss.new_record()
                cols = [doc_id, word]
                for i, val in enumerate(cols):
                    rec.set(i, val)
                writer.write(rec)
        writer.close()
        # Commit the single writer that was opened with id 0.
        upload_ss.commit([0, ])
예제 #6
0
    def create_word_triple(self, table_name):
        """Recreate ``table_name`` with per-line word counts of the corpus.

        The table is created as ``(id string, word string, count bigint)``.  Each
        stripped line of the ``splited_words.txt`` resource is split on ``####``;
        for every distinct piece (in sorted order) one record is written holding
        the 1-based line number as text, the piece, and how often it occurs on
        that line.  Reading stops at the first line that is empty after stripping.

        :param table_name: name of the table to drop, create and populate
        """
        self.odps.execute_sql("drop table if exists " + table_name)
        self.odps.execute_sql("create table %s (id string, word string, count bigint) lifecycle 3" % table_name)

        session = self.tunnel.create_upload_session(table_name)
        record_writer = session.open_record_writer(0)

        raw_text = load_resource_string('odps.examples.data', 'splited_words.txt')
        for line_no, raw_line in enumerate(raw_text.splitlines()):
            stripped = raw_line.strip()
            if not stripped:
                break

            # groupby only merges adjacent equal items, hence the sort first.
            tokens = stripped.split('####')
            tokens.sort()
            for word, occurrences in groupby(tokens):
                record = session.new_record()
                values = [str(line_no + 1), word, sum(1 for _ in occurrences)]
                for pos, value in enumerate(values):
                    record.set(pos, value)
                record_writer.write(record)
        record_writer.close()
        session.commit([0, ])
예제 #7
0
    def create_ionosphere_one_part(self, table_name, partition_count=3):
        """Recreate ``table_name`` partitioned by ``part`` and spread the ionosphere rows over it.

        The table has the double columns ``a01`` .. ``a34`` plus ``class bigint``
        and is partitioned by ``part bigint``.  Partitions ``part=0`` ..
        ``part=partition_count - 1`` are added, one upload session and one writer
        are opened per partition, and line ``n`` of the ``ionosphere.txt``
        resource goes to partition ``n % partition_count``.  The partition id is
        also set on the record as one extra trailing value.

        :param table_name: name of the table to drop, create and populate
        :param partition_count: number of ``part`` partitions to create and fill
        """
        column_defs = ['a%02d double' % idx for idx in range(1, 35)]
        fields = ','.join(column_defs) + ', class bigint'
        self.odps.execute_sql("drop table if exists " + table_name)
        self.odps.execute_sql("create table %s (%s) partitioned by (part bigint) lifecycle 3" % (table_name, fields))
        for part_id in range(partition_count):
            self.odps.execute_sql('alter table %s add if not exists partition (part=%d)' % (table_name, part_id))

        # Open every session before opening any writer.
        sessions = []
        for part_id in range(partition_count):
            sessions.append(self.tunnel.create_upload_session(table_name, 'part=%d' % part_id))
        part_writers = []
        for session in sessions:
            part_writers.append(session.open_record_writer(0))

        raw_text = load_resource_string('odps.examples.data', 'ionosphere.txt')
        for line_no, row in enumerate(raw_text.splitlines()):
            part_id = line_no % partition_count
            record = sessions[part_id].new_record()

            values = []
            for pos, text in enumerate(row.split(',')):
                if record._columns[pos].type == 'double':
                    values.append(float(text))
                else:
                    values.append(int(text))
            values.append(part_id)

            for pos, value in enumerate(values):
                record.set(pos, value)
            part_writers[part_id].write(record)

        # Close all writers first, then commit each session.
        for part_writer in part_writers:
            part_writer.close()
        for session in sessions:
            session.commit([0, ])
예제 #8
0
    def create_ionosphere_two_parts(self, table_name, partition1_count=2, partition2_count=3):
        """Recreate ``table_name`` partitioned by ``part1``/``part2`` and spread the ionosphere rows over it.

        The table has the double columns ``a01`` .. ``a34`` plus ``class bigint``
        and is partitioned by ``(part1 bigint, part2 bigint)``.  Every
        ``(part1, part2)`` combination is added as a partition and gets its own
        upload session and writer.  Line ``n`` of the ``ionosphere.txt`` resource
        goes to partition ``(n % partition1_count, n % partition2_count)``, and
        both ids are also set on the record as two extra trailing values.

        NOTE(review): both ids are derived from the same line number, so when the
        two counts share a common factor some of the created partitions never
        receive a row.

        :param table_name: name of the table to drop, create and populate
        :param partition1_count: number of distinct ``part1`` values
        :param partition2_count: number of distinct ``part2`` values
        """
        column_defs = ['a%02d double' % idx for idx in range(1, 35)]
        fields = ','.join(column_defs) + ', class bigint'
        self.odps.execute_sql("drop table if exists " + table_name)
        self.odps.execute_sql("create table %s (%s) partitioned by (part1 bigint, part2 bigint) lifecycle 3" % (table_name, fields))
        for id1, id2 in product(range(partition1_count), range(partition2_count)):
            self.odps.execute_sql('alter table %s add if not exists partition (part1=%d, part2=%d)' % (table_name, id1, id2))

        # session_grid[id1][id2] / writer_grid[id1][id2] address one partition.
        session_grid = []
        for id1 in range(partition1_count):
            session_row = []
            for id2 in range(partition2_count):
                session_row.append(
                    self.tunnel.create_upload_session(table_name, 'part1=%d,part2=%d' % (id1, id2)))
            session_grid.append(session_row)

        writer_grid = []
        for session_row in session_grid:
            writer_row = []
            for session in session_row:
                writer_row.append(session.open_record_writer(0))
            writer_grid.append(writer_row)

        raw_text = load_resource_string('odps.examples.data', 'ionosphere.txt')
        for line_no, row in enumerate(raw_text.splitlines()):
            id1 = line_no % partition1_count
            id2 = line_no % partition2_count
            record = session_grid[id1][id2].new_record()

            values = []
            for pos, text in enumerate(row.split(',')):
                if record._columns[pos].type == 'double':
                    values.append(float(text))
                else:
                    values.append(int(text))
            values.extend([id1, id2])

            for pos, value in enumerate(values):
                record.set(pos, value)
            writer_grid[id1][id2].write(record)

        # Close all writers first, then commit each session.
        for writer_row in writer_grid:
            for part_writer in writer_row:
                part_writer.close()
        for session_row in session_grid:
            for session in session_row:
                session.commit([0, ])