def resolve_uid(self, x):
    """Parse HTML and update it with URLs pointing to Plone objects.

    ex. url: "http://wordpress.com/wp-content/uploads/2010/04/image.jpg"
    becomes: "resolveuid/c82a53270c904cfbbfd1a0d4cef90676"

    :param x: [required] regex match to process
    :type x: re.MatchObject
    :returns: the tag with an internal URL
    :rtype: str
    """
    start = x.group(1)  # start of tag, e.g. '<img src="'
    url = x.group(2)  # URL
    end = x.group(3)  # end of tag, e.g. '" />'
    url = fix_id(url)
    o = urlparse(url)
    internal_url = o.netloc == self.domain
    is_site_root = o.path in ('', '/')
    # links to external URLs or to the site root are ignored
    if not internal_url or is_site_root:
        return x.group(0)  # return unchanged
    path = str(o.path).strip(' ').lstrip('/')
    obj = traverse(self.context, path, None)
    if obj is None:  # object not found
        logger.warn('Could not resolve UUID: {0}'.format(url))
        return x.group(0)  # return unchanged
    # create the internal URL
    uuid = obj.UID()
    return '{0}resolveuid/{1}{2}'.format(start, uuid, end)
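
# resolve_uid() is written as a re.sub() callback, so the substitution
# pattern must capture three groups: the start of the tag, the URL and the
# end of the tag. The pattern and helper below are a minimal sketch for
# illustration only; the blueprint's actual regex may differ.
import re

IMG_SRC_SKETCH = re.compile(r'(<img[^>]*?src=")([^"]+)(")')

def rewrite_internal_links(self, html):
    """Hypothetical helper: rewrite every matched URL in the HTML."""
    return IMG_SRC_SKETCH.sub(self.resolve_uid, html)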
def _skip(row, skip):
    """Test if we need to skip row processing, dealing with the
    following cases:

    - parsing errors
    - items with revision type
    - items with draft status
    - explicit request

    :param row: [required] row to be analyzed
    :type row: dict
    :param skip: [required] list of item IDs to be explicitly skipped
    :type skip: list
    :returns: True if we will skip the row
    :rtype: bool
    """
    if row['ID'] in skip:
        logger.info('Skipping row ID: ' + row['ID'])
        return True
    elif len(row) != 23 and 'publish' in row.values():
        logger.warn('Parsing error on row ID: ' + row['ID'])
        return True
    elif row['post_type'] not in KNOWN_POST_TYPES:
        logger.warn('Unknown post type on row ID: ' + row['ID'])
        return True
    elif row['post_type'] == 'revision':
        logger.debug('Revision type on row ID: ' + row['ID'])
        return True
    elif row['post_status'] == 'draft':
        logger.debug('Draft status on row ID: ' + row['ID'])
        return True
    return False
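
# Usage sketch for _skip(): it expects a csv.DictReader row and relies on
# a module-level KNOWN_POST_TYPES constant (not shown here; this sketch
# assumes it includes 'post'). The fake row below carries just enough
# fields to exercise the draft branch; its values are illustrative.
row = {'ID': '42', 'post_type': 'post', 'post_status': 'draft'}
assert _skip(row, skip=[]) is True  # drafts are never imported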
def __iter__(self):
    fetch_errors = []  # record all errors
    for item in self.previous:
        if '_guid' not in item:
            yield item
            continue

        url = item['_guid']
        path = item['_path']  # TODO: read path key from options
        if not path:  # not enough information
            yield item
            continue

        obj = self.context.unrestrictedTraverse(
            path.encode().lstrip('/'), None)

        # if the object exists, we try to avoid downloading it again
        if obj is not None:
            if obj.portal_type not in ('File', 'Image'):
                # not an attachment
                yield item
                continue

            # request only the header to check it
            try:
                r = requests.head(url)
            except ConnectionError:
                # skip on connection error
                fetch_errors.append(url)
                yield item
                continue

            # the Content-Length header could be missing if the remote
            # web server is misconfigured for some MIME types
            size = int(r.headers.get('content-length', 0))
            if size == obj.size():
                # already downloaded it
                yield item
                continue

        try:
            r = requests.get(url)
        except RequestException:
            # skip on timeouts and other errors
            fetch_errors.append(url)
            yield item
            continue

        if r.status_code != 200:
            # log the error and skip the item
            fetch_errors.append(url)
            msg = u'Error {0} when fetching {1}'.format(r.status_code, url)
            logger.warn(msg)
            yield item
            continue

        item['_data'] = r.content
        yield item
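
# The download-avoidance pattern above in isolation: issue a HEAD request
# first and only GET when the remote size differs from the local one. A
# minimal self-contained sketch using the same requests calls; the
# function name and parameters are hypothetical.
import requests
from requests.exceptions import ConnectionError, RequestException

def fetch_if_changed(url, local_size):
    """Return the remote payload, or None if unchanged or unreachable."""
    try:
        head = requests.head(url)
    except ConnectionError:
        return None
    if int(head.headers.get('content-length', 0)) == local_size:
        return None  # same size: assume we already have it
    try:
        r = requests.get(url)
    except RequestException:
        return None
    return r.content if r.status_code == 200 else None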
def add_related_content(self, obj, item):
    """Look into the WordPress list of related content and create the
    Plone related content list.

    :param obj: [required] object to add related content to
    :type obj: type constructor parameter
    :param item: [required] transmogrifier item
    :type item: dict
    """
    # get the string with the URLs of the related content
    pinged = item.get('_pinged', '')
    if pinged == '':
        return  # no related content

    # the field contains multiple URLs concatenated without a
    # separator; to break it into a list we split on "http" and
    # reconstruct each URL
    # TODO: handle HTTPS scheme
    related_urls = set('http{0}'.format(url.rstrip('/'))
                       for url in pinged.split('http')[1:])

    # create a list of related items to update the object's field
    related_items = []
    intids = getUtility(IIntIds)
    for url in related_urls:
        # parse the URL and check the domain
        url = fix_id(url)
        o = urlparse(url)
        if o.netloc != self.domain:
            continue
        path = str(o.path).strip(' ').lstrip('/')
        related_obj = traverse(self.context, path, None)
        if related_obj is None:  # object not found
            logger.warn('Broken link: {0}'.format(url))
            continue
        # get the related item's intid
        to_id = intids.getId(related_obj)
        related_items.append(RelationValue(to_id))

    if len(related_items) == 0:
        return  # no related content

    obj.relatedItems = related_items
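
# The URL reconstruction above in isolation: WordPress's "pinged" field
# stores the URLs concatenated without a separator, so splitting on "http"
# and prepending it back recovers each one. The sample string is
# illustrative only.
pinged = 'http://example.com/ahttp://example.com/b'
urls = set('http{0}'.format(u.rstrip('/')) for u in pinged.split('http')[1:])
assert urls == {'http://example.com/a', 'http://example.com/b'}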
def __iter__(self):
    for item in self.previous:
        yield item

    filename = os.path.join(self.source, 'wp_posts.csv')
    assert os.path.isfile(filename), 'Missing file: ' + filename

    with open(filename) as csvfile:
        csv.field_size_limit(self.field_size_limit)
        reader = csv.DictReader(csvfile, **csv_options)
        for row in reader:
            if _skip(row, self.skip):  # should we process this row?
                continue

            item = dict()
            post_type = row['post_type']
            if post_type == 'post':
                # posts are imported as the configured portal_type
                item['portal_type'] = self.portal_type
            elif post_type == 'page':
                # pages are imported as Page
                item['portal_type'] = 'Page'
            elif post_type == 'attachment':
                # attachments are imported as Image or File
                is_image = row['post_mime_type'].startswith('image')
                item['portal_type'] = 'Image' if is_image else 'File'
                item['_mimetype'] = row['post_mime_type']
                item['_guid'] = row['guid']  # store for later

            if post_type != 'attachment':
                # for posts and pages the id is the post name
                item_id = row['post_name']
                # Zope ids need to be ASCII
                item_id = fix_id(item_id)
                item['title'] = strip_tags(row['post_title'])
            else:
                # for attachments we need to parse the guid
                # and use the file name as title
                url = urlparse(row['guid'])
                item_id = item['title'] = url.path.split('/')[-1]
                item_id = fix_id(item_id)

            # in Zope, ids can't start with "_"
            if bad_id(item_id) is not None:
                logger.warn('Invalid object id on row ID: ' + row['ID'])
                continue

            # WordPress stores only publication and modification times;
            # we use the publication date as creation date
            item['creation_date'] = item['effective_date'] = row['post_date']
            item['modification_date'] = row['post_modified']

            try:
                item['_path'] = self.get_path(
                    row['ID'], item_id, post_type, item)
            except KeyError:
                # files defining taxonomies are probably outdated
                logger.warn('No taxonomies found for row ID: ' + row['ID'])
                continue

            item['description'] = row['post_excerpt']
            # quotes are escaped; we need to fix that
            item['text'] = row['post_content'].replace('\\"', '"')
            # TODO: validate HTML to avoid post-processing surprises

            # use display_name instead of author_id, if a match is found
            author_id = row['post_author']
            item['creators'] = self.display_names.get(author_id, author_id)

            if row['post_status'] == 'publish':
                item['_transitions'] = 'publish'

            item['_pinged'] = row['pinged']  # store for later

            yield item
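
# The escaped-quote fix above in isolation: WordPress CSV dumps escape
# double quotes inside post_content with a backslash, and the replace()
# call undoes that. The sample string is illustrative only.
content = 'He said \\"hello\\" and left'
assert content.replace('\\"', '"') == 'He said "hello" and left'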