def prepare_es_dto(obj):
    """Normalise an article dict in place and return it.

    The following keys are written on ``obj``:

    * ``main`` is always set to ``False``.
    * ``fulltext`` defaults to ``''`` when absent.
    * ``date_iso`` is filled from ``obj['date'].isoformat()`` when it is
      absent and ``date`` holds a ``datetime``; an existing ``datetime``
      value under ``date_iso`` is converted to its ISO string.
    * ``seen`` becomes ``[]`` when it is absent, ``None`` or a ``str``.
    * ``similar`` is always reset to ``[]``.
    * ``fulltext``, ``content`` and ``title`` are passed through ``clean``.

    Args:
        obj: Mutable mapping describing an article.  ``content`` and
            ``title`` must be present.

    Returns:
        The same ``obj`` instance, after modification.

    Raises:
        KeyError: If ``content`` or ``title`` is missing.
    """
    obj['main'] = False
    obj.setdefault('fulltext', '')

    if 'date_iso' not in obj:
        # .get() so a record carrying neither 'date' nor 'date_iso' is left
        # without 'date_iso' instead of raising KeyError.
        date = obj.get('date')
        if isinstance(date, datetime):
            obj['date_iso'] = date.isoformat()
    elif isinstance(obj['date_iso'], datetime):
        obj['date_iso'] = obj['date_iso'].isoformat()

    # A missing key and an explicit None are handled identically, so one
    # lookup covers both; a string value is discarded as well.
    seen = obj.get('seen')
    if seen is None or isinstance(seen, str):
        obj['seen'] = []

    # 'similar' is reset unconditionally, whatever the incoming value was.
    # NOTE(review): this discards any existing list -- confirm it is intended.
    obj['similar'] = []

    obj['fulltext'] = clean(obj['fulltext'])
    obj['content'] = clean(obj['content'])
    obj['title'] = clean(obj['title'])
    return obj
def get_instance(cls, dictArticle, source):
    """Create and save a record for a feed entry if its link is new.

    Args:
        cls: Class whose ``objects.get_or_create`` is used to look up or
            create the record by ``link``; ``cls.__name__`` is reported as
            the article's ``tag``.
        dictArticle: Entry object exposing ``description``, ``title``,
            ``link``, ``links`` and ``published_parsed`` attributes.
        source: String identifying where the entry came from; it is part
            of the hash input and is stored on the record.

    Returns:
        A dict with the keys ``title``, ``link``, ``hash_key``, ``content``,
        ``source``, ``tag``, ``image_url`` and ``date`` when a new record
        was created and saved.  ``None`` when the entry has no description,
        when a record with the same link already existed, or when any
        exception was raised (it is reported via ``utils.print_exception``).
    """
    try:
        if not dictArticle.description:
            return None

        content = lxml.html.fromstring(dictArticle.description).text_content()

        # Non-ASCII characters are dropped before hashing.
        hash_input = ':'.join([dictArticle.title, content, source]).encode(
            'ascii', 'ignore')
        hash_key = md5_constructor(hash_input).hexdigest()

        published = dictArticle.published_parsed
        if published:
            # NOTE(review): mktime() interprets the struct_time as local
            # time -- confirm published_parsed is not expressed in UTC.
            article_date = datetime.fromtimestamp(mktime(published)).isoformat()
        else:
            # No publication time available: fall back to the current time.
            article_date = datetime.now().isoformat()

        # NOTE(review): if get_or_create persists the record immediately, a
        # failure below leaves a record with only `link` set -- confirm.
        a, created = cls.objects.get_or_create(link=dictArticle.link)
        if not created:
            return None

        article = {
            'title': utils.clean(dictArticle.title),
            'link': dictArticle.link,
            'hash_key': hash_key,
            'content': utils.clean(content),
            'source': source,
            'tag': cls.__name__,
            'image_url': get_image_url(dictArticle.links),
            'date': article_date,
        }

        a.title = article['title']
        a.hash_key = article['hash_key']
        a.content = article['content']
        a.source = article['source']
        a.date = article['date']
        a.image_url = article['image_url']
        a.save()
        return article
    except Exception:
        # Best effort: report the failure and signal "nothing created".
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        utils.print_exception()
        return None