示例#1
0
文件: ljm.py 项目: isteroid/ljparser
def compare2versions():
    """Print a comparison report for two saved post lists of one journal.

    The current calendar list and the ``_compare_`` snapshot of the
    hard-coded journal are loaded as sets of lines. The report shows the
    size of each set, the size of each one-way difference, and finally the
    differing lines themselves.
    """
    ljname = 'evo-lutio'
    filename1 = '{0}/calendar/_{0}_.txt'.format(ljname)
    filename2 = '{0}/calendar/_compare_.txt'.format(ljname)
    now = set(lj.loadfromfile(filename1, lines=True))
    before = set(lj.loadfromfile(filename2, lines=True))
    # Compute both one-way differences once so the counts and the listings
    # below can never disagree.
    only_now = now - before
    only_before = before - now
    print('now: {}'.format(len(now)))
    print('bef: {}'.format(len(before)))
    print('now-bef: {}'.format(len(only_now)))
    print('bef-now: {}'.format(len(only_before)))
    print('now-bef')
    print(list(only_now))
    print('bef-now')
    print(list(only_before))
示例#2
0
文件: ljm.py 项目: isteroid/ljparser
def uniqfilelines(filename, sort=False):
    """Write the unique, non-empty lines of *filename* to ``<filename>_u_.txt``.

    Duplicates are removed keeping the first occurrence of each line, so the
    output order is reproducible between runs (a plain ``set`` round-trip
    yields an arbitrary order).

    Args:
        filename: path of the file to read; the result is written next to it
            with the ``_u_.txt`` suffix appended.
        sort: when true, order the lines by the integer post id extracted
            from each line, largest id first.
    """
    # The second positional argument asks the loader for a sequence of lines
    # (presumably the ``lines`` flag used elsewhere in this file).
    lines = lj.loadfromfile(filename, True)
    # dict.fromkeys dedupes while preserving insertion order.
    unique = [line for line in dict.fromkeys(lines) if line]
    if sort:
        unique.sort(reverse=True, key=lambda url: int(lj.extractpostid(url)))
    lj.savetofile('\n'.join(unique), filename + '_u_.txt')
示例#3
0
def parseljpost(url, forced: bool):
	"""Turn the downloaded raw copy of one post into its ``data`` files.

	Nothing happens when the processed JSON already exists (unless *forced*
	is true) or when the raw file for the post is missing.

	Args:
		url: post URL; the post id and journal name are extracted from it.
		forced: reprocess even if the processed JSON file is already present.
	"""
	postid = lj.extractpostid(url)
	ljname = lj.extractljname(url)
	rawpath = '{}/raw/{}.raw'.format(ljname, postid)
	htmpath = '{}/data/{}.htm'.format(ljname, postid)
	jsonpath = '{}/data/{}.htm.json'.format(ljname, postid)

	alreadydone = not forced and lj.checkfileexist(jsonpath)
	if alreadydone or not lj.checkfileexist(rawpath):
		return

	soup = BeautifulSoup(lj.loadfromfile(rawpath), features='html.parser')
	jsondata = json.loads(lj.loadfromfile(rawpath + '.json'))
	jsondata['entry']['tags'] = gettags(soup)
	jsondata['entry']['ljname'] = ljname

	article = cleanarticle(soup.find('article', class_='entry-content'))
	# The cleaned article is written once before the image pass and again
	# after it, exactly as the processing order requires.
	lj.savetofile(article.prettify(), htmpath)
	article = loadimagesa(ljname, postid, article)

	comments = loadimagesc(ljname, postid, jsondata['comments'])
	jsondata['comments'] = loaduserpics(ljname, postid, comments)

	lj.savetofile(jsondata, jsonpath)
	lj.savetofile(article.prettify(), htmpath)
示例#4
0
def makeupdate(ljname: str, count: int):
	"""Upload the first posts of the calendar list and their assets over FTP.

	For the selected posts this uploads the processed ``.htm`` / ``.htm.json``
	files, the post images, the comment images, the userpics referenced by
	the comments, and finally the calendar list itself.

	Args:
		ljname: journal name; also the name of the local root directory.
		count: number of posts in the update. Nothing is done when it is
			falsy (``0`` or ``None``).
	"""
	if not count:
		return
	lj.title('make web update')

	def localdir(sub):
		# Path of a sub-directory inside the journal's local folder.
		return '{ljname}/{dir}'.format(ljname=ljname, dir=sub)

	# NOTE(review): count + 1 lines are taken, i.e. one post more than
	# requested -- presumably so the post adjacent to the update is
	# re-uploaded as well; confirm the intent.
	posts = lj.loadfromfile('{0}/calendar/_{0}_.txt'.format(ljname), True)[:count+1]
	files = []
	upics = []  # userpic file names, in first-seen order
	seenpics = set()  # O(1) duplicate check for upics
	ids = set()
	for post in posts:
		postid = lj.extractpostid(post)
		ids.add(postid)
		jsondata = json.loads(lj.loadfromfile('{dir}/{postid}.htm.json'.format(dir=localdir('data'), postid=postid)))
		for comment in jsondata['comments']:
			userpic = comment.get('userpic')
			if userpic:
				# Only the file name part of the userpic path is uploaded.
				picname = userpic.split('/')[-1]
				if picname not in seenpics:
					seenpics.add(picname)
					upics.append(picname)
		files += [postid + '.htm', postid + '.htm.json']
	print(len(upics))

	imgext = ('.jpg', '.png', '.gif', '.svg')

	def postimages(sub):
		# Image files in *sub* whose name prefix (the text before the
		# first '-') is the id of one of the selected posts.
		return [
			file for file in os.listdir(localdir(sub))
			if file.endswith(imgext) and file.split('-', 1)[0] in ids
		]

	images = postimages('images/')
	cimages = postimages('images/comments/')

	lj.title('uploading data to ftp...')
	ftptransfer(files, localdir('data/'), 'data/', True)
	lj.title('uploading images to ftp...')
	ftptransfer(images, localdir('images/'), 'images/', False)
	# Each title announces the transfer that immediately follows it.
	lj.title('uploading comment images to ftp...')
	ftptransfer(cimages, localdir('images/comments/'), 'images/comments/', False)
	lj.title('uploading userpics to ftp...')
	ftptransfer(upics, localdir('images/userpics/'), 'images/userpics/', False)
	ftptransfer(['_{}_.txt'.format(ljname)], localdir('calendar/'), 'calendar/', True)
	print('uploading is complete!')
示例#5
0
def processing(ljname: str, skip=None, maxcount=None, forced=False):
	"""Process the raw downloads of a journal's posts into ``data`` files.

	Reads the journal's calendar list and runs ``parseljpost`` on the URL of
	every selected line. Does nothing beyond printing the title when the
	calendar list does not exist.

	Args:
		ljname: journal name; also the name of the local root directory.
		skip: slice start into the post list (``int`` or ``None``).
		maxcount: slice end into the post list (``int`` or ``None``). Note
			that it is used as an end index, not as a number of posts.
		forced: passed through to ``parseljpost`` to reprocess posts that
			already have processed data.
	"""
	lj.title('processing raw -- to --> data')
	filename = '{0}/calendar/_{0}_.txt'.format(ljname)
	if lj.checkfileexist(filename):
		posts = lj.loadfromfile(filename, lines=True)
		for post in posts[skip:maxcount]:
			print(post)
			# Each line is "<url> <title>"; only the URL is needed here.
			url = post.split(' ', 1)[0]
			parseljpost(url, forced)
		print('-'*40)
		print(len(posts))
示例#6
0
def testdata(ljname: str):
	"""Check the navigation links stored in raw post JSON and repair them.

	Every ``.json`` file in the journal's ``raw`` directory is inspected.
	When its ``prev`` or ``next`` navigation value is shorter than five
	characters, the navigation is recomputed from the calendar list and, if
	the recomputed value looks usable, written back to the same file.

	Args:
		ljname: journal name; also the name of the local root directory.
	"""
	lj.title('testing')
	rawdir = '{}/raw/'.format(ljname)
	posts = lj.loadfromfile('{0}/calendar/_{0}_.txt'.format(ljname), True)
	postsids = [lj.extractpostid(post) for post in posts]
	for name in os.listdir(rawdir):
		if not name.endswith('.json'):
			continue
		# File names start with the numeric post id.
		postid = int(name.split('.', 1)[0])
		path = rawdir + name
		jsondata = json.loads(lj.loadfromfile(path))
		nav = jsondata['entry']['nav']
		if len(nav['prev']) >= 5 and len(nav['next']) >= 5:
			continue  # both links present, nothing to repair
		print('{} * {} * {}'.format(nav['prev'], postid, nav['next']))
		key = str(postid)
		if key not in postsids:
			print('Пост {} отсутствует в списке постов. Обновите список постов!'.format(postid))
			continue
		nav = lj.getljnavigation(posts, postsids.index(key))
		repaired = len(nav['next']) > 5 or len(nav['prev']) > 5
		if not repaired:
			print('Не удалось исправить навигацию: {}'.format(postid))
			continue
		jsondata['entry']['nav'] = nav
		lj.savetofile(jsondata, path, 'w')
		print('Навигация успешно исправлена: {} {} {}'.format(postid, nav['prev'], nav['next']))
示例#7
0
文件: ljm.py 项目: isteroid/ljparser
def makehtml(ljname: str,
             skip=None,
             maxcount=None,
             mkposts=True,
             mkindex=True):
    """Generate HTML content from the already processed data.

    The ``_collapsed_`` list is emptied first and de-duplicated at the end,
    whether or not the calendar list exists.

    Args:
        ljname: journal name; also the name of the local root directory.
        skip: slice start into the post list used for post generation.
        maxcount: slice end into the post list used for post generation.
        mkposts: generate the individual post pages.
        mkindex: generate the index page (always from the full post list).
    """
    lj.title('generate posts .htmls')
    calendar = '{0}/calendar/_{0}_.txt'.format(ljname)
    collapsed = '{0}/calendar/_collapsed_.txt'.format(ljname)
    # Start with an empty comment-errors file.
    lj.savetofile('', collapsed, 'w')
    if lj.checkfileexist(calendar):
        posts = lj.loadfromfile(calendar, lines=True)
        selected = posts[skip:maxcount] if mkposts else []
        for post in selected:
            print(post)
            makepost(post.split(' ', 1)[0])
        if mkindex:
            makeindex(ljname, posts)
    uniqfilelines(collapsed, sort=True)
示例#8
0
文件: ljm.py 项目: isteroid/ljparser
def makeindex(ljname, posts):
    """Build ``<ljname>/index.html`` from the list of posts.

    The template file ``_index_templ.html`` is split on ``~`` into four
    blocks: page start, entry for a post whose HTML page exists, entry for a
    post without one, and page end.

    Args:
        ljname: journal name; also the name of the local root directory.
        posts: iterable of ``"<url> <title>"`` lines. Blank lines are
            skipped, and a line without a title gets an empty title.
    """
    lj.title('generate html index')
    csstime = int(time.time())
    template = lj.loadfromfile('_index_templ.html').split('~')
    templateblocks = {
        'start': template[0],
        'ok': template[1],
        'none': template[2],
        'end': template[3]
    }
    fields = {'ljname': ljname, 'time': csstime}
    # Collect the pieces and join once instead of growing a string.
    parts = [templateblocks['start'].format(**fields)]
    for item in posts:
        if not item.strip():
            continue  # tolerate blank lines in the list file
        # partition never raises when the separator (and title) is missing.
        url, _, title = item.partition(' ')
        postid = lj.extractpostid(url)
        pagepath = '{}/post/{}.html'.format(ljname, postid)
        block = 'ok' if lj.checkfileexist(pagepath) else 'none'
        parts.append(templateblocks[block].format(postid=postid, title=title))
    parts.append(templateblocks['end'])
    lj.savetofile(''.join(parts), '{}/index.html'.format(ljname))
示例#9
0
文件: ljm.py 项目: isteroid/ljparser
def makepost(url):
    """Render one processed post with its comment tree to ``post/<id>.html``.

    Nothing is done when the processed ``.htm`` file of the post is missing.
    The template ``_post_templ.html`` is split on ``~``: block 0 is the page
    head, block 1 the page tail, block 2 opens a nested comment level,
    block 3 is one comment and block 4 closes a nested comment level.

    Comments without a ``dtalkid`` are not rendered; instead the post is
    appended to the journal's ``_collapsed_`` list.

    Args:
        url: post URL; the post id and journal name are extracted from it.
    """
    csstime = int(time.time())
    postid = lj.extractpostid(url)
    ljname = lj.extractljname(url)
    datafile = '{}/data/{}.htm'.format(ljname, postid)
    if not lj.checkfileexist(datafile):
        return

    article = BeautifulSoup(lj.loadfromfile(datafile),
                            features='html.parser')
    jsondata = json.loads(lj.loadfromfile(datafile + '.json'))
    template = lj.loadfromfile('_post_templ.html').split('~')
    ulopen, commentblock, ulclose = template[2], template[3], template[4]

    entry = jsondata['entry']
    nav = entry['nav']
    nextid = lj.extractpostid(nav['next'])
    previd = lj.extractpostid(nav['prev'])
    fields = {
        'title': entry['title'],
        'article': article,
        'prev': previd,
        'prevtitle': nav['prevtitle'],
        'next': nextid,
        'nexttitle': nav['nexttitle'],
        'postid': entry['ditemid'],
        'ljname': entry['ljname'],
        'time': csstime
    }

    chunks = [template[0].format(**fields)]
    levels = {}  # comment id (as str) -> nesting level
    currentlevel = 0
    for comment in jsondata['comments']:
        dtalkid = comment.get('dtalkid')
        if not dtalkid:
            print('Читать комменты: ' + url)
            # Record the post in the comment-errors file.
            lj.savetofile('{} {}\n'.format(url, entry['title']),
                          '{0}/calendar/_collapsed_.txt'.format(ljname),
                          'a')
            continue

        parent = comment.get('parent', None)
        # A reply sits one level below its parent; top-level is 0.
        level = levels[str(parent)] + 1 if parent else 0
        comment['level'] = level
        levels[str(dtalkid)] = level

        isauthor = comment['uname'] == jsondata['entry']['journal']
        cfields = {
            'level': level,
            'userpic': comment.get('userpic'),
            'user': comment['uname'],
            'article': comment.get('article'),
            'class': 'author' if isauthor else '',
            'userlj': comment.get('commenter_journal_base')
        }

        # Open or close as many nesting levels as the depth changed by.
        delta = level - currentlevel
        if delta > 0:
            chunks.append(ulopen * delta)
        elif delta < 0:
            chunks.append(ulclose * -delta)
        chunks.append(commentblock.format(**cfields))
        currentlevel = level

    chunks.append(template[1].format(**fields))
    html = ''.join(chunks)
    # Turn absolute links into this journal into relative ones.
    html = html.replace('href="https://{}.livejournal.com/'.format(ljname),
                        'href="')
    html = html.replace('href="http://{}.livejournal.com/'.format(ljname),
                        'href="')
    lj.savetofile(html, '{}/post/{}.html'.format(ljname, postid))