Exemplo n.º 1
0
    def import_samples_from_xls(self, xls_file):
        """Import the rows of a spreadsheet into ``self.db`` as msgpack records.

        The rows come from ``load_excel_to_rows(xls_file)``. A block of ids is
        requested up front through ``self.acquire_sample_id(len(rows))`` and
        each row receives the next consecutive id. Every record is the tuple
        ``(sample_id, title, content)`` packed with msgpack and stored under
        the key ``str(sample_id)``. All records are collected in one
        ``leveldb.WriteBatch`` and written with ``sync=True`` at the end.

        The ``TITLE`` and ``CONTENT`` columns are optional; a missing column is
        stored as ``None``.

        NOTE(review): rows are assumed to be mapping-like (supporting ``in``
        and item access by column name) -- confirm against
        ``load_excel_to_rows``.
        """
        rows = load_excel_to_rows(xls_file)
        total = len(rows)
        first_id = self.acquire_sample_id(total)

        batch = leveldb.WriteBatch()
        for offset, row in enumerate(rows):
            current_id = first_id + offset

            content = row[u"CONTENT"] if u"CONTENT" in row else None
            title = row[u"TITLE"] if u"TITLE" in row else None

            packed = msgpack.dumps((current_id, title, content))
            batch.Put(str(current_id), packed)

            # Emit a progress line only for ids that are multiples of 100.
            if current_id % 100 == 0:
                logging.debug(Logger.debug("(%d/%d) %s" % (current_id, total, title)))

        self.db.Write(batch, sync=True)
Exemplo n.º 2
0
def import_samples_from_xls(samples, categories, xls_file):
    """Build a leveldb write batch with one msgpack record per spreadsheet row.

    The rows come from ``load_excel_to_rows(xls_file)``. A block of ids is
    requested through ``samples.corpus.acquire_sample_id(len(rows))`` and each
    row receives the next consecutive id. The batch is returned, not written;
    the caller is responsible for committing it.

    Columns read from each row:

    * ``CONTENT`` -- required (a missing column raises ``KeyError``); an empty
      string is stored as ``None``.
    * ``TITLE``, ``URL`` -- optional, default ``""``.
    * ``DATE`` -- optional, defaults to ``datetime.now()``. A text cell must
      look like ``"Y.M.D"``; any other value is passed to
      ``xldate_to_datetime``.
    * ``KEY`` -- optional, default ``""``. Values that are not already text
      are converted to ``unicode``.
    * ``CAT1``, ``CAT2``, ``CAT3`` -- optional, stripped, default ``""``. They
      are resolved to a category id via
      ``categories.create_or_get_category_id``.

    The ``CATEGORY`` column is not consulted; the category id always comes
    from ``CAT1``..``CAT3``.

    Each stored record is the tuple::

        (sample_id, category_id,
         (year, month, day, hour, minute, second),
         title, key, url,
         (version, content, (cat1, cat2, cat3)))

    with ``version`` fixed to ``"1"``, keyed by ``str(sample_id)``.

    NOTE(review): rows are assumed to be mapping-like (supporting ``in`` and
    item access by column name) -- confirm against ``load_excel_to_rows``.

    :param samples: object exposing ``corpus.acquire_sample_id``.
    :param categories: object exposing ``create_or_get_category_id``.
    :param xls_file: spreadsheet handed to ``load_excel_to_rows``.
    :return: the populated ``leveldb.WriteBatch``.
    """
    corpus = samples.corpus

    rows = load_excel_to_rows(xls_file)
    num_samples = len(rows)
    sample_id = corpus.acquire_sample_id(num_samples)

    version = "1"
    batch_content = leveldb.WriteBatch()
    for row in rows:
        content = row[u"CONTENT"]
        if content == "":
            content = None

        title = row[u"TITLE"] if u"TITLE" in row else ""

        if u"DATE" in row:
            row_date = row[u"DATE"]
            if isinstance(row_date, unicode):
                y, m, d = row_date.split('.')
                date = datetime(int(y), int(m), int(d))
            else:
                date = xldate_to_datetime(row_date)
        else:
            date = datetime.now()

        key = ""
        if u"KEY" in row:
            key = row[u"KEY"]
            # Only non-text values (e.g. numeric cells) need converting.
            # Running str() on a unicode key would implicitly encode it as
            # ASCII and fail on any non-ASCII character.
            if not isinstance(key, (str, unicode)):
                key = str(key).decode('utf-8')

        url = row[u"URL"] if u"URL" in row else ""

        cat1, cat2, cat3 = [
            row[column].strip() if column in row else ""
            for column in (u"CAT1", u"CAT2", u"CAT3")
        ]

        msgext = (version, content, (cat1, cat2, cat3))

        category_id = categories.create_or_get_category_id(cat1, cat2, cat3)

        timestamp = (date.year, date.month, date.day,
                     date.hour, date.minute, date.second)
        sample_data = (sample_id, category_id, timestamp, title, key, url, msgext)
        rowstr = msgpack.dumps(sample_data)
        batch_content.Put(str(sample_id), rowstr)

        # Emit a progress line only for ids that are multiples of 100.
        if sample_id % 100 == 0:
            logging.debug(Logger.debug("Row: %d/%d %s %s" % (sample_id, num_samples, date, title)))
        sample_id += 1

    return batch_content