Example #1
def _download(resource_type, resource_url, resource_target):
    """Does the actual downloading."""
    _hl2 = None
    if resource_type == 'feed':
        _hl2 = httplib2.Http(cache="../../cache/httplib2/feed", timeout=5)
    elif resource_type == 'image':
        _hl2 = httplib2.Http(cache="../../cache/httplib2/image", timeout=5)
    if _hl2 is None:
        raise NotImplementedError('unexpected resource type: %r' % resource_type)
    _logger = LoggerFactory().getLogger('_download')
    try:
        resp, content = _hl2.request(resource_url)
        if resp.fromcache:
            msg = "Cache contained a current version of %s %s." % (resource_type, resource_url)
            _logger.info(msg)
        else:
            msg = "Downloaded %s from %s to %s." % (resource_type, resource_url, resource_target)
            _logger.info(msg)
            with open(resource_target, 'w') as f:
                content = Decoder().decode(content)
                f.write(content)
    except (AttributeError, IOError, TypeError, UnicodeError, ValueError,
            httplib.IncompleteRead, httplib.InvalidURL, httplib.BadStatusLine,
            httplib2.RelativeURIError, httplib2.RedirectLimit,
            httplib2.ServerNotFoundError) as e:
        # TODO actually do some error handling here
        _logger.error(e)
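
A minimal driver for this helper, assuming the project-local LoggerFactory and Decoder are importable and that httplib2 is installed; the URLs, cache directories and target paths are made up for illustration:

# Hypothetical one-off calls; in the project this runs inside Threader's pool.
_download('feed', 'http://feeds.feedburner.com/example',
          '../../static/2-Feeds/feeds.feedburner.com/example')
_download('image', 'http://www.example.com/cover.jpg',
          '../../web/img/www.example.com/cover.jpg')
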
Example #2
class PathTool:
    """Has knowledge about our conventions regarding the file system layout."""

    _logger = LoggerFactory().getLogger('PathTool')

    def __init__(self):
        PathTool._logger.debug('Initializing.')
        relativeProjectRoot = "../../"
        self._directoriesPath = relativeProjectRoot + "static/0-Directories/"
        self._feedListsPath = relativeProjectRoot + "static/1-Feedlists/"
        self._feedsPath = relativeProjectRoot + "static/2-Feeds/"
        self._imagesPath = relativeProjectRoot + "web/img/"
        PathTool._logger.debug('Initialized.')

    def getDirectoriesPath(self):
        return self._directoriesPath

    def getFeedsPath(self):
        return self._feedsPath

    def getFeedListsPath(self):
        return self._feedListsPath

    def getImagesPath(self):
        return self._imagesPath
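
A short sketch of how PathTool is used by the other examples, assuming the project-local LoggerFactory is importable; the printed values simply follow the relativeProjectRoot convention set in __init__:

pt = PathTool()
print pt.getFeedsPath()      # ../../static/2-Feeds/
print pt.getFeedListsPath()  # ../../static/1-Feedlists/
print pt.getImagesPath()     # ../../web/img/
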
Example #3
class Decoder:

    _logger = LoggerFactory().getLogger('Decoder')

    knownEncodings = [
        "UTF-8", "ASCII", "UTF-16", "UTF-32", "Big5", "GB2312", "EUC-TW",
        "HZ-GB-2312", "ISO-2022-CN", "EUC-JP", "SHIFT_JIS", "ISO-2022-JP",
        "EUC-KR", "ISO-2022-KR", "KOI8-R", "MacCyrillic", "IBM855", "IBM866",
        "ISO-8859-5", "windows-1251", "ISO-8859-2", "windows-1250",
        "windows-1252", "ISO-8859-7", "windows-1253", "ISO-8859-8",
        "windows-1255", "TIS-620"
    ]

    def __init__(self):
        pass

    def decode(self, byteString):
        """Tries the known encodings in turn; returns the decoded string,
        or None if none of them worked."""
        #encoding = chardet.detect(byteString)

        for encoding in self.knownEncodings:
            try:
                return byteString.decode(encoding)
            except (UnicodeError, LookupError) as e:
                msg = "Was not able to decode using encoding %s, got error %s" \
                      % (encoding, e)
                self._logger.debug(msg)

        self._logger.warning('Was not able to decode with any known encoding.')
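
A quick sketch of the fallback chain, using made-up byte strings:

d = Decoder()
print d.decode('plain ascii')   # succeeds on the first attempt (UTF-8)
print d.decode('\xe4\xf6\xfc')  # not valid UTF-8; falls through the list until
                                # a more permissive encoding accepts the bytes
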
Example #4
class ResourceDownloader:
    """Commonly used tool that downloads resources."""

    _logger = LoggerFactory().getLogger('ResourceDownloader')
    _resources = []
    _downloadedResources = []

    def __init__(self):
        self._tdr = Threader()
        self._pt = PathTool.PathTool()
        self._rc = ResourceChecker()
        self._rh = ResourceHelper()
        self.last_download_timestamp = 0

    def download(self, resource_type, resource_url):
        """Downloads a resource of type feed or image by its URL."""

        if not self._rc.check_remote_resource(resource_type, resource_url):
            return

        resource = Resource(resource_url, resource_type)
        if resource.get_absolute_url().endswith('/'):
            resource._set_url(resource.get_absolute_url()[:-1])
        resource_target = resource.get_path()
        base_path = resource.get_base_path()
        msg = 'Will download resource %s with target %s to location %s.' \
              % (resource_url, resource_target, base_path)
        ResourceDownloader._logger.debug(msg)

        self._rh.ensurePathExists(base_path)

        args = [resource_type, resource_url, resource_target]

        duplicate_found = False
        for dedup_args in (ResourceDownloader._resources +
                           ResourceDownloader._downloadedResources):
            if dedup_args[2] == args[2]:
                duplicate_found = True
                break
        if not duplicate_found:
            ResourceDownloader._resources.append(args)

        time_since_last_download = time.time() - self.last_download_timestamp
        # Batch the downloads: only flush the queue to the thread pool once
        # more than 1000 resources are queued or a minute has passed.  TODO
        if (len(ResourceDownloader._resources) <= 1000
                and time_since_last_download <= 60):
            return

        resources_tmp = ResourceDownloader._resources
        ResourceDownloader._resources = []
        ResourceDownloader._downloadedResources = ResourceDownloader._downloadedResources + resources_tmp
        self.last_download_timestamp = time.time()
        self._tdr.run_parallel_in_threads(_download, resources_tmp)
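
A sketch of the queueing behaviour, with illustrative URLs; note that download() only enqueues until the flush condition above is met:

rd = ResourceDownloader()
for i in range(5):
    # Each call is deduplicated against the queue by target path and then
    # queued; nothing is fetched until >1000 items or >60s have accumulated.
    rd.download('feed', 'http://www.example.com/feed%d.rss' % i)
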
Example #5
class ResourceHelper:

    _logger = LoggerFactory().getLogger('ResourceHelper')
    _pt = PathTool.PathTool()
    _rc = ResourceChecker()

    def __init__(self):
        pass

    def ensurePathExists(self, path):
        """Makes sure a given path exists.
        Tries to create the given path and ignores the failure if it already
        exists.
        See: http://stackoverflow.com/questions/600268/mkdir-p-functionality-in-python"""

        ResourceHelper._logger.info('Ensuring path %s exists.' % path)

        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno in (errno.EEXIST, errno.ENOTDIR):
                return
            raise

    def stripWhitespace(self, filename):
        """Collapses all whitespace runs (spaces, newlines, tabs etc.) into
        single spaces and strips leading and trailing whitespace."""
        filename = ' '.join(filename.split())
        return filename

    def getAllFeedPaths(self):
        """Gathers all feed paths."""
        feedsPath = self._pt.getFeedsPath()
        ResourceHelper._logger.debug('Walking %s.' % feedsPath)
        relativeFeedFilePaths = []
        for root, dirs, files in os.walk(feedsPath):
            for filePath in files:
                relativePath = os.path.join(root, filePath)
                if not self._rc.check_local_resource(relativePath, 'feed'):
                    ResourceHelper._logger.debug('Skipping %s.' % relativePath)
                    continue
                relativeFeedFilePaths.append(relativePath)
            if '/me/' in root:
                break
        return relativeFeedFilePaths
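
A usage sketch with hypothetical paths:

rh = ResourceHelper()
rh.ensurePathExists('../../static/2-Feeds/www.example.com/')  # mkdir -p semantics
print rh.stripWhitespace(' http://www.example.com/rss \n')    # 'http://www.example.com/rss'
for path in rh.getAllFeedPaths():                             # walks static/2-Feeds/
    print path
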
Example #6
class ImagesDownloader:
    _projectRoot = '../../../'

    _logger = LoggerFactory().getLogger('ImagesDownloader')
    _rd = ResourceDownloader()

    def __init__(self):
        pass

    def handleFeed(self, relativeFeedFilePath):
        """Downloads a single image gathered from a given feed."""
        imgUrl = self.getImageUrl(relativeFeedFilePath)
        self.downloadImage(imgUrl)

    def getImageUrl(self, podcastFeedFilePath):
        p = self.parsePodcast(podcastFeedFilePath)
        if not p: return

        try:
            imageUrl = p.feed.image.href
        except AttributeError:
            warnString = 'Feed %s did not contain an image.' \
                         % podcastFeedFilePath
            ImagesDownloader._logger.warn(warnString)
            return

        msg = "Parsed image URL '%s' from feed '%s'" % (imageUrl,
                                                        podcastFeedFilePath)
        ImagesDownloader._logger.info(msg)
        return imageUrl

    def parsePodcast(self, podcast):
        try:
            podcast = feedparser.parse(podcast)
            return podcast
        except (UnicodeDecodeError, IndexError):
            msg = "Feed '%s' contains undecodable characters." % podcast
            ImagesDownloader._logger.warn(msg)

    def downloadImage(self, imgUrl):
        ImagesDownloader._rd.download("image", imgUrl)
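
A usage sketch, assuming a feed file has already been downloaded to the hypothetical path below:

idl = ImagesDownloader()
idl.handleFeed('../../static/2-Feeds/www.example.com/rss')  # parses the feed,
                                                            # then queues its cover image
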
Example #7
class FeedsDownloaderRunner:
    """Runs the DownloadTool with URLs of feeds, gathered from the feed lists."""

    _fd = FeedsDownloader()
    _pt = PathTool.PathTool()
    _rh = ResourceHelper()
    _logger = LoggerFactory().getLogger('FeedsDownloaderRunner')
    
    def __init__(self):
        FeedsDownloaderRunner._logger.debug('Initialized.')

    def run(self):
        """Runs downloads of all feeds."""
        feed_urls = self.get_all_feed_urls()
        self.download_feeds(feed_urls)
    
    def handle_single_feed_list(self, feed_list_path):
        """Runs downloads for one feed list."""
        feed_urls = self.get_feed_urls_from_feed_list(feed_list_path)
        self.download_feeds(feed_urls)
        
    def download_feeds(self, feed_urls):
        """Runs downloads of specified feeds."""
        FeedsDownloaderRunner._logger.info('Starting downloads for %s feeds.' % len(feed_urls))
        self._fd.downloadFeeds(feed_urls)
        FeedsDownloaderRunner._logger.info('Done.')
        
    def get_all_feed_urls(self):
        """Collects all URLs of feeds from the lists of feeds."""
        
        feed_lists_directory = self._pt.getFeedListsPath()
        relative_feed_lists_paths = os.listdir(feed_lists_directory)
        all_feed_urls = []
        for relative_feed_list_path in relative_feed_lists_paths:
            if relative_feed_list_path == 'podster.list':
                continue
            if relative_feed_list_path == 'podcast.com.json':
                continue
            some_feed_urls = self.get_feed_urls_from_feed_list(relative_feed_list_path)
            for feed_url in some_feed_urls:
                feed_url = self._rh.stripWhitespace(feed_url)
                all_feed_urls.append(feed_url)
        return all_feed_urls
    
    def get_feed_urls_from_feed_list(self, feed_list_path):
        """Parses all feed urls from download_feeds list of feeds by its path."""
        
        feed_lists_directory = self._pt.getFeedListsPath()
        absolute_feed_list_path = feed_lists_directory + feed_list_path
        if feed_list_path.endswith('.json'):
            feed_urls = self.get_feed_urls_from_json_feed_list(absolute_feed_list_path)
        else:
            feed_urls = self.get_feed_urls_from_text_feed_list(absolute_feed_list_path)
        return feed_urls
        
    def get_feed_urls_from_json_feed_list(self, absolute_feed_list_path):
        feed_urls = []
        FeedsDownloaderRunner._logger.debug(absolute_feed_list_path)
        with open(absolute_feed_list_path, 'r') as f:
            contents = f.read()
            feed_items = json.loads(contents)
            for feed_item in feed_items:
                feed_urls.append(feed_item['link'])
        return feed_urls

    def get_feed_urls_from_text_feed_list(self, absolute_feed_list_path):
        feed_urls = []
        with open(absolute_feed_list_path, 'r') as f:
            for line in f:
                feed_urls.append(self._rh.stripWhitespace(line))
        return feed_urls
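
A usage sketch; 'example.opml.list' is a hypothetical file name under static/1-Feedlists/:

runner = FeedsDownloaderRunner()
runner.run()                                         # all feed lists
runner.handle_single_feed_list('example.opml.list')  # just one list
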
Example #8
class ResourceChecker:
    """Checks resources for sanity on various levels."""

    _logger = LoggerFactory().getLogger('ResourceChecker')

    # Really valid is only 'application/rss+xml',
    # see http://stackoverflow.com/questions/595616/
    # Not quite right but also not toxic:
    not_quite_right_feed_mime_types = [
        'text/html',   # test for HTML later
        'text/plain',
        'text/x-php',  # test for PHP later
        'text/x-c++'   # test for C++ later
    ]
    not_quite_wrong_feed_mime_types = [
        'application/rss+xml', 'application/xml', 'text/xml'
    ]
    valid_feed_mime_types = not_quite_wrong_feed_mime_types

    def __init__(self):
        self._pt = PathTool.PathTool()

    def check_remote_resource(self, resource_type, url):
        """Checks the URL for sanity according to the resource_type.
        Returns True if the URL is sane for this resource_type, otherwise
        False."""

        if not url:
            return False
        if not resource_type:
            return False  # TODO we should raise here

        if not self._check_general_url(url):
            return False

        if resource_type == 'feed':
            sanity = self._check_feed_url(url)
        elif resource_type == 'image':
            sanity = self._check_image_url(url)
        else:
            return False  # unknown resource type

        if sanity:
            sanity = self._check_remote_existence(url)

        return sanity

    def _check_general_url(self, url):
        """Checks an URL for sanity. Returns True if the URL is sane, otherwise
        False.
        >>> from ResourceChecker import ResourceChecker
        >>> resource_checker = ResourceChecker()
        >>> resource_checker._check_general_url("http://example.com")
        True"""

        url_validator = UrlValidator()
        sanity = url_validator.validate(url)

        return sanity

    def _check_image_url(self, url):
        """Checks an image URL for sanity. Returns True if the URL is sane,
        otherwise False."""

        # We skip dataUrls
        if url.startswith('data:'):
            return False

        sanity = self._check_remote_image_mime_type(url)

        return sanity

    def _check_feed_url(self, url):
        """Checks an URL of a feed for sanity. Returns True if the URL is sane,
        otherwise False."""

        if self._check_feed_url_file_type(url):
            return True

        resource_type = 'feed'

        feed_filename = Resource(url, resource_type).get_filename()
        if not self._check_remote_feed_mime_type(feed_filename):
            return False

        sanity = self._check_remote_feed_mime_type(url)
        if not sanity:
            msg = "Did not recognize mime media type of feed %s." % (url)
            ResourceChecker._logger.warn(msg)
        return sanity

    def _check_feed_url_file_type(self, url):
        """Accepts feed URLs whose form alone already implies a feed."""
        # Google's Feedburner always does the right thing.(tm)
        if url.startswith('http://feeds.feedburner.com/'):
            return True

        # If the URL seems sane, we believe that, too.
        if url.endswith('/rss'):
            return True

        return False

    def check_local_resource(self, path, resourceType):
        """Checks a local resource of any kind for sanity.
        
        TODO doctest
        """
        if resourceType == "directory":
            raise  # not yet implemented
        if resourceType == "feed":
            return self._check_local_feed(path)
        if resourceType == "image":
            raise  # not yet implemented
            #return self._checkLocalImage(path)

    def _check_local_feed(self, feed_path):
        """Checks a path of a local feed for sanity."""

        # from cheapest to most expensive
        if os.path.isdir(feed_path):
            return False
        if self._check_local_feed_mime_type(feed_path):
            return True
        if self._check_local_feed_magic(feed_path):
            return True
        #if self._check_local_feed_bad_ends(feed_path):
        #    return False
        # TODO check for binary
        return False

    def _check_remote_feed_mime_type(self, filename):
        """Checks the mimetype of a remote feed by looking at its filename."""
        return self._check_local_feed_mime_type(filename)

    def _check_local_feed_mime_type(self, filename):
        """Checks mimetype of a given file by looking at its filename."""
        mimetype, encoding = mimetypes.guess_type(filename, strict=False)
        if mimetype in self.valid_feed_mime_types:
            return True
        return False

    def _check_local_feed_magic(self, feed_path):
        """Checks the mimetype of a given local file by looking at the file
        header."""
        mymagic = magic.Magic(mime=True)
        mimetype = mymagic.from_file(feed_path.encode('UTF-8'))
        mimetype = mimetype.decode().split('; ')[0]
        if mimetype in self.valid_feed_mime_types:
            return True
        msg = "Skipping %s %s." % (mimetype, feed_path)
        ResourceChecker._logger.warn(msg)
        return False

    def _check_local_image_mime_type(self, filename):
        sanity = self._check_image_mime_type(filename)
        return sanity

    def _check_remote_image_mime_type(self, filename):
        sanity = self._check_image_mime_type(filename)
        return sanity

    def _check_image_mime_type(self, filename):
        """Checks the mime type guessed from the filename."""
        mimetype = mimetypes.guess_type(filename, strict=False)[0]
        if mimetype and mimetype.startswith("image/"):
            return True
        return False

    def _check_local_feed_bad_ends(self, feedPath):
        """Checks whether the feed given by path is really an HTML, PHP or C
        source file, judging by its first and last tokens."""

        bad_heads = ['<html>', '<?php', '#include']
        bad_tails = ['</html>', '?>', '}']

        with open(feedPath, 'rb') as feed:
            feed = feed.read()
            try:
                feed = feed.decode('utf-8')
            except UnicodeDecodeError:
                try:
                    feed = feed.decode('latin-1')
                except UnicodeDecodeError:
                    raise
            feed = ' '.join(feed.split())

        for head in bad_heads:
            if feed.startswith(head):
                return True
        for tail in bad_tails:
            if feed.endswith(tail):
                return True
        return False

    def _check_remote_existence(self, url):
        """
        Checks whether there exists a remote resource at the given path.

        >>> from ResourceChecker import ResourceChecker
        >>> resource_checker = ResourceChecker()
        >>> url = 'http://www.example.com/fakepath'
        >>> resource_checker._check_remote_existence(url)
        True
        >>> url = "http://www.asdfaljhfajefjksafbnlrnvlksvs.com/"
        >>> resource_checker._check_remote_existence(url)
        False
        """

        (netloc, path) = urlparse.urlparse(url)[1:3]
        conn = httplib.HTTPConnection(netloc)
        try:
            conn.request('HEAD', path)
        except socket.gaierror:
            return False  # : [Errno -2] Name or service not known
        try:
            response = conn.getresponse()
        except AttributeError:
            return False  # : 'NoneType' object has no attribute 'makefile'
        conn.close()
        return response.status in (200, 301, 302)
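
A usage sketch with illustrative URLs and paths; note that check_remote_resource also issues a HEAD request via _check_remote_existence:

rc = ResourceChecker()
print rc.check_remote_resource('feed', 'http://feeds.feedburner.com/example')
print rc.check_remote_resource('image',
                               'data:image/png;base64,iVBORw0KGgo=')  # False, data URLs are skipped
print rc.check_local_resource('../../static/2-Feeds/www.example.com/rss', 'feed')
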
Example #9
class FeedParserCli:
    
    _logger = LoggerFactory().getLogger('FeedParserCli')
    
    # http://packages.python.org/feedparser/reference.html
    
    fields = {
    'bozo': False,
    'bozo_exception': False,
    'encoding': False,
    'entries': {},
#===============================================================================
# entries[i].author
# entries[i].author_detail
# entries[i].author_detail.name
# entries[i].author_detail.href
# entries[i].author_detail.email
# entries[i].comments
# entries[i].content
# entries[i].content[j].value
# entries[i].content[j].type
# entries[i].content[j].language
# entries[i].content[j].base
# entries[i].contributors
# entries[i].contributors[j].name
# entries[i].contributors[j].href
# entries[i].contributors[j].email
# entries[i].created
# entries[i].created_parsed
# entries[i].enclosures
# entries[i].enclosures[j].href
# entries[i].enclosures[j].length
# entries[i].enclosures[j].type
# entries[i].expired
# entries[i].expired_parsed
# entries[i].id
# entries[i].license
# entries[i].link
# entries[i].links
# entries[i].links[j].rel
# entries[i].links[j].type
# entries[i].links[j].href
# entries[i].links[j].title
# entries[i].published
# entries[i].published_parsed
# entries[i].publisher
# entries[i].publisher_detail
# entries[i].publisher_detail.name
# entries[i].publisher_detail.href
# entries[i].publisher_detail.email
# entries[i].source
# entries[i].source.author
# entries[i].source.author_detail
# entries[i].source.contributors
# entries[i].source.icon
# entries[i].source.id
# entries[i].source.link
# entries[i].source.links
# entries[i].source.logo
# entries[i].source.rights
# entries[i].source.rights_detail
# entries[i].source.subtitle
# entries[i].source.subtitle_detail
# entries[i].source.title
# entries[i].source.title_detail
# entries[i].source.updated
# entries[i].source.updated_parsed
# entries[i].summary
# entries[i].summary_detail
# entries[i].summary_detail.value
# entries[i].summary_detail.type
# entries[i].summary_detail.language
# entries[i].summary_detail.base
# entries[i].tags
# entries[i].tags[j].term
# entries[i].tags[j].scheme
# entries[i].tags[j].label
# entries[i].title
# entries[i].title_detail
# entries[i].title_detail.value
# entries[i].title_detail.type
# entries[i].title_detail.language
# entries[i].title_detail.base
# entries[i].updated
# entries[i].updated_parsed
# entries[i].vcard
# entries[i].xfn
# entries[i].xfn[j].relationships
# entries[i].xfn[j].href
# entries[i].xfn[j].name
#===============================================================================
    'etag': False,
    'feed': {
        'author': False,
        'author_detail': {
            'name', 'href', 'email'
        },
        'cloud': {
            'domain': False, 'port': False, 'path': False,
            'registerProcedure': False, 'protocol': False,
            'contributors': {
                'name', 'href', 'email', 'docs', 'errorreportsto'
            }
        },
        'generator': False,
        'generator_detail': {
            'name',
            'href',
            'version',
        },
        'icon': False,
        'id': False,
        'image': {
            'href', 'link', 'width', 'height', 'description'
        }
    }
#===============================================================================
# feed.info
# feed.info_detail
# feed.info_detail.value
# feed.info_detail.type
# feed.info_detail.language
# feed.info_detail.base
# feed.language
# feed.license
# feed.link
# feed.links
# feed.links[i].rel
# feed.links[i].type
# feed.links[i].href
# feed.links[i].title
# feed.logo
# feed.published
# feed.published_parsed
# feed.publisher
# feed.publisher_detail
# feed.publisher_detail.name
# feed.publisher_detail.href
# feed.publisher_detail.email
# feed.rights
# feed.rights_detail
# feed.rights_detail.value
# feed.rights_detail.type
# feed.rights_detail.language
# feed.rights_detail.base
# feed.subtitle
# feed.subtitle_detail
# feed.subtitle_detail.value
# feed.subtitle_detail.type
# feed.subtitle_detail.language
# feed.subtitle_detail.base
# feed.tags
# feed.tags[i].term
# feed.tags[i].scheme
# feed.tags[i].label
# feed.textinput
# feed.textinput.title
# feed.textinput.link
# feed.textinput.name
# feed.textinput.description
# feed.title
# feed.title_detail
# feed.title_detail.value
# feed.title_detail.type
# feed.title_detail.language
# feed.title_detail.base
# feed.ttl
# feed.updated
# feed.updated_parsed
# headers
# href
# modified
# namespaces
# status
# version
#===============================================================================
}
    
    dynamicFields = ['author',
                     'contributors',
                     'docs',
                     'errorreportsto',
                     'generator',
                     'title',
                     'description',
                     'image',
                     'language',
                     'link',
                     'summary',
                     'itunes:explicit',
                     'itunes:subtitle',
                     'itunes:summary',
                     'updated',
                     'updated_parsed',
                     'headers',
                     'href',
                     'modified',
                     'namespaces',
                     'status',
                     'version',
                     'encoding',
                     'etag'
                     ]
    
    def __init__(self):
        pass
    
    def run(self):

        while True:
            feedPath = sys.stdin.readline()
            if not feedPath:
                break
            feed = self._parseFeed(feedPath.strip())
            if not feed:
                continue

            #feedDict = self.createFeedDict(feed)
            feedDict = self.createFeedDictRecursive(feed, None, self.fields)

            out = json.dumps(feedDict)
            sys.stdout.write(out)
            sys.stdout.flush()

    def _parseFeed(self, feedPath):
        try:
            #feed = speedparser.parse(feedPath)['feed']
            feed = feedparser.parse(feedPath)
            try:
                feed = feed['feed']
            except (KeyError, TypeError):
                return False

        except xml.sax._exceptions.SAXException:
            sys.stderr.write("Aborted.")
            sys.stderr.flush()
            return False

        return feed
    
    def createFeedDictRecursive(self, feed, root, fields):
        """Recursively flattens the parsed feed along the fields spec."""
        feedDict = {}
        self._logger.debug('root0: %s' % root)
        if root:
            fields = fields[root]
            feed = feed.get(root, {})

        for fieldKey in fields:
            # Dicts and sets in the spec describe nested structures;
            # False marks a leaf, e.g. 'encoding': False.
            subFields = fields[fieldKey] if isinstance(fields, dict) else False
            if isinstance(subFields, (dict, set)):
                self._logger.debug('root1: %s' % fieldKey)
                feedDict.update(
                    self.createFeedDictRecursive(feed, fieldKey, fields))
            else:
                try:
                    feedDict[fieldKey] = feed[fieldKey]
                except (KeyError, TypeError):
                    self._logger.debug('no such field %s' % fieldKey)

        self._logger.debug(feedDict)
        return feedDict

    def createFeedDict(self, feed):

        feedDict = {}
        for fieldKey in self.dynamicFields:
            try:
                fieldValue = feed[fieldKey]
                try:
                    fieldKey = fieldKey + '_s'  # let's assume all fields are dynamic
                    fieldValue = BeautifulSoup(fieldValue).get_text()
                    fieldValue = ' '.join(fieldValue.split())
                    feedDict[fieldKey] = fieldValue
                except TypeError:
                    fieldValue = ""
            except (KeyError, TypeError):
                pass
        if feedDict == {}:
            return
        return feedDict
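
A usage sketch; the feed URL is illustrative. createFeedDict flattens the parsed feed and appends '_s' to each key (a dynamic-field naming convention):

cli = FeedParserCli()
feed = feedparser.parse('http://feeds.feedburner.com/example')['feed']
print json.dumps(cli.createFeedDict(feed))
# cli.run() would instead read feed paths from stdin, one per line,
# and stream each result to stdout as JSON.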