# Module-level dependencies assumed here: csv, sys, ctypes, binascii, json,
# logging, and datetime (from datetime import datetime), plus the project's
# conn, Article, amcates, SKIP_PARENTS and PROP_FIELDS.
def get_articles(self, fn, media):
    """Read the articles CSV dump, resolve parent hashes over multiple
    passes, and yield one tuple per article ready for insertion."""
    csv.field_size_limit(sys.maxsize)

    def _int(x):
        return int(x) if x else None

    def hash2binary(hash):
        # PostgreSQL hex-format bytea literal (\x...); None falls through to None
        if hash:
            if not isinstance(hash, str):
                raise TypeError("Hash should be str, not {}".format(type(hash)))
            return "\\x" + hash

    r = csv.reader(open(fn))
    header = next(r)
    index = {col: i for (i, col) in enumerate(header)}
    AID = index['article_id']

    if self.maxid:
        logging.info("*** max(id) set by user: {self.maxid}".format(**locals()))
        max_id, self.n_rows = self.maxid, self.maxid
    else:
        logging.info("*** Scan input CSV to determine #rows and max(id)")
        max_id = 0
        for row in r:
            max_id = max(max_id, int(row[AID]))
            self.n_rows += 1
            if not self.n_rows % 10000000:
                logging.info(".. scanned {self.n_rows} rows".format(**locals()))
        self.maxid = max_id

    logging.info("{self.n_rows} rows, max ID {max_id}, allocating memory for hashes"
                 .format(**locals()))
    # One flat buffer with a fixed 28-byte (SHA-224) slot per article id;
    # an all-zero slot means that article's hash has not been computed yet.
    hashes = ctypes.create_string_buffer(max_id * 28)
    NULL_HASH = b'\x00' * 28

    # "PLENTY" means: too many orphans to buffer in memory, so re-read the file
    orphans = "PLENTY"
    passno = 1

    if self._continue:
        logging.info("Continuing from previous migration, getting state from DB")
        with conn().cursor('migration-continue') as c:
            c.itersize = 10000  # how many records to buffer on the client
            c.execute("SELECT article_id, hash FROM articles")
            i = 0
            while True:
                rows = c.fetchmany(10000)
                if not rows:
                    break
                i += len(rows)
                if not i % 1000000:
                    logging.info("Retrieved {i} rows...".format(**locals()))
                for (aid, hash) in rows:
                    offset = (aid - 1) * 28
                    hashes[offset:offset + 28] = hash
        self.n_rows -= i
        logging.info("Continuing migration, {i} articles retrieved, "
                     "up to {self.n_rows} to go".format(**locals()))

    while orphans:
        norphans = len(orphans) if isinstance(orphans, list) else orphans
        logging.info("*** Pass {passno}, #orphans {norphans}".format(**locals()))
        passno += 1
        if orphans == "PLENTY":
            r = csv.reader(open(fn))
            next(r)  # skip header
            todo = r
        else:
            todo = orphans
        orphans = []
        MAX_ORPHANS_BUFFER = 50000
        for i, row in enumerate(todo):
            if not i % 1000000:
                norphans = len(orphans) if isinstance(orphans, list) else orphans
                logging.info("Row {i}, #orphans: {norphans}".format(**locals()))
            aid = int(row[AID])
            offset = (aid - 1) * 28
            stored_hash = hashes[offset:offset + 28]
            if stored_hash != NULL_HASH:
                continue  # already processed in an earlier pass
            parent_id = _int(row[index['parent_article_id']])
            if (parent_id == aid) or (parent_id in SKIP_PARENTS):
                parent_id = None
            if parent_id:
                poffset = (parent_id - 1) * 28
                parent_hash = hashes[poffset:poffset + 28]
                if parent_hash == NULL_HASH:
                    # It's an orphan; we can't process it now, so either
                    # buffer it or leave it for a full re-iteration
                    if orphans != "PLENTY":  # try to buffer
                        if len(orphans) > MAX_ORPHANS_BUFFER:
                            orphans = "PLENTY"
                        else:
                            orphans.append(row)
                    continue
                parent_hash = binascii.hexlify(parent_hash).decode("ascii")
            else:
                parent_hash = None
            date = row[index['date']]
            date = date.split("+")[0]  # strip timezone offset
            date = datetime.strptime(date[:19], '%Y-%m-%d %H:%M:%S')
            a = Article(project_id=row[index['project_id']],
                        date=date,
                        title=row[index['headline']],
                        url=row[index['url']] or None,
                        text=row[index['text']],
                        parent_hash=parent_hash)
            a.properties = {v: row[index[v]] for v in PROP_FIELDS if row[index[v]]}
            a.properties['medium'] = media[int(row[index['medium_id']])]
            a.properties['uuid'] = str(a.properties['uuid'])
            props = json.dumps(a.properties)
            hash = amcates.get_article_dict(a)['hash']
            hashes[offset:offset + 28] = binascii.unhexlify(hash)
            yield (a.project_id, aid, a.date, a.title, a.url, a.text,
                   hash2binary(hash), hash2binary(a.parent_hash), props)
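
# --- Sketch: the fixed-width hash buffer used by get_articles above. ---
# A minimal, self-contained illustration (all names here are hypothetical):
# article N's 28-byte SHA-224 digest lives at offset (N - 1) * 28 in one flat
# ctypes buffer, and an all-zero slot reads back as "hash not computed yet".
import ctypes
import hashlib

DIGEST_SIZE = 28                      # SHA-224 digests are 28 bytes
DEMO_NULL_HASH = b'\x00' * DIGEST_SIZE

def demo_hash_buffer(max_id=10):
    buf = ctypes.create_string_buffer(max_id * DIGEST_SIZE)  # zero-initialized

    def put(aid, text):
        offset = (aid - 1) * DIGEST_SIZE
        buf[offset:offset + DIGEST_SIZE] = hashlib.sha224(text.encode()).digest()

    def get(aid):
        offset = (aid - 1) * DIGEST_SIZE
        digest = buf[offset:offset + DIGEST_SIZE]
        return None if digest == DEMO_NULL_HASH else digest

    put(3, "some article text")
    assert get(3) is not None         # slot 3 now holds a digest
    assert get(4) is None             # untouched slots read back as all-zero

demo_hash_buffer()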
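
# --- Sketch: multi-pass orphan resolution, the idea behind the while loop ---
# above. A hypothetical toy version: a row may reference a parent that only
# appears later in the file, so we keep re-scanning, processing only rows
# whose parent has already been handled, until no orphans remain.
def process_in_passes(rows):
    done = set()
    orphans = list(rows)
    while orphans:
        todo, orphans = orphans, []
        progress = False
        for (aid, parent_id) in todo:
            if parent_id is not None and parent_id not in done:
                orphans.append((aid, parent_id))   # parent not seen yet: retry
                continue
            done.add(aid)
            progress = True
        if not progress:                           # cycle or missing parent
            raise ValueError("unresolvable orphans: {}".format(orphans))
    return done

# Child (2) precedes its parent (1); pass 1 defers it, pass 2 resolves it.
assert process_in_passes([(2, 1), (1, None), (3, 2)]) == {1, 2, 3}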