def import_samples_from_xls(self, xls_file):
    """Load the rows of an Excel file and write them to ``self.db`` in one batch.

    The row count is passed to ``self.acquire_sample_id`` and the value it
    returns is used as the id of the first row; each following row gets the
    next consecutive id.  Every row is stored under ``str(sample_id)`` as the
    msgpack-serialized tuple ``(sample_id, title, content)``, where ``title``
    and ``content`` come from the optional ``TITLE`` / ``CONTENT`` columns
    and are ``None`` when the column is absent from the row.

    Args:
        xls_file: Passed unchanged to ``load_excel_to_rows``.
    """
    records = load_excel_to_rows(xls_file)
    total = len(records)
    first_id = self.acquire_sample_id(total)

    batch = leveldb.WriteBatch()
    for current_id, record in enumerate(records, first_id):
        title = record[u"TITLE"] if u"TITLE" in record else None
        content = record[u"CONTENT"] if u"CONTENT" in record else None

        packed = msgpack.dumps((current_id, title, content))
        batch.Put(str(current_id), packed)

        # Emit a progress line only for ids that are multiples of 100.
        if current_id % 100 == 0:
            logging.debug(Logger.debug("(%d/%d) %s" % (current_id, total, title)))

    # All puts are committed together with a synchronous write.
    self.db.Write(batch, sync=True)
def _xls_cell(row, column, default=""):
    """Return ``row[column]`` when the column is present, else ``default``."""
    return row[column] if column in row else default


def import_samples_from_xls(samples, categories, xls_file):
    """Convert the rows of an Excel file into a LevelDB write batch.

    The row count is passed to ``samples.corpus.acquire_sample_id`` and the
    returned value is used as the id of the first row; later rows get
    consecutive ids.  Each row is stored under ``str(sample_id)`` as the
    msgpack-serialized tuple::

        (sample_id, category_id,
         (year, month, day, hour, minute, second),
         title, key, url,
         (version, content, (cat1, cat2, cat3)))

    Columns read from each row (all optional): ``CONTENT``, ``TITLE``,
    ``DATE``, ``KEY``, ``URL``, ``CAT1``, ``CAT2``, ``CAT3``.  A missing or
    empty ``CONTENT`` is stored as ``None``; the other missing text columns
    are stored as ``""``; a missing ``DATE`` falls back to the current time.

    Args:
        samples: Object exposing ``corpus.acquire_sample_id``.
        categories: Object exposing
            ``create_or_get_category_id(cat1, cat2, cat3)``.
        xls_file: Passed unchanged to ``load_excel_to_rows``.

    Returns:
        The populated ``leveldb.WriteBatch``.  This function does not write
        the batch to any database itself.
    """
    corpus = samples.corpus
    rows = load_excel_to_rows(xls_file)
    num_samples = len(rows)
    sample_id = corpus.acquire_sample_id(num_samples)
    batch_content = leveldb.WriteBatch()

    for row in rows:
        # CONTENT is looked up like every other optional column so a sheet
        # without it no longer aborts the import with KeyError.
        content = _xls_cell(row, u"CONTENT", None)
        if content == "":
            content = None

        title = _xls_cell(row, u"TITLE")

        if u"DATE" in row:
            row_date = row[u"DATE"]
            if isinstance(row_date, unicode):
                # Text cell: split on '.' into year, month, day.
                y, m, d = row_date.split('.')
                date = datetime(int(y), int(m), int(d))
            else:
                # Non-text cell: presumably an Excel serial date -- confirm
                # against xldate_to_datetime.
                date = xldate_to_datetime(row_date)
        else:
            date = datetime.now()

        key = _xls_cell(row, u"KEY")
        if not isinstance(key, (str, unicode)):
            # Only non-string cells (e.g. numbers) are converted to text.
            # Running str() on a unicode key would implicitly encode it as
            # ASCII and raise UnicodeEncodeError for non-ASCII characters.
            key = str(key).decode('utf-8')

        url = _xls_cell(row, u"URL")
        cat1 = _xls_cell(row, u"CAT1").strip()
        cat2 = _xls_cell(row, u"CAT2").strip()
        cat3 = _xls_cell(row, u"CAT3").strip()

        version = "1"
        msgext = (version, content, (cat1, cat2, cat3))
        category_id = categories.create_or_get_category_id(cat1, cat2, cat3)
        timestamp = (date.year, date.month, date.day,
                     date.hour, date.minute, date.second)
        sample_data = (sample_id, category_id, timestamp,
                       title, key, url, msgext)

        batch_content.Put(str(sample_id), msgpack.dumps(sample_data))

        # Emit a progress line only for ids that are multiples of 100.
        if sample_id % 100 == 0:
            # NOTE(review): the outer logging.debug receives whatever
            # Logger.debug returns -- confirm the double call is intended.
            logging.debug(Logger.debug("Row: %d/%d %s %s" % (sample_id, num_samples, date, title)))
        sample_id += 1

    return batch_content