def try_down_version(url):
    """Download the CRX linked from a listing page and archive it by version.

    Fetches ``url``, takes the extension id from the first Chrome Web Store
    detail link found in the HTML, downloads the target of the first
    ``rel="nofollow"`` anchor to a temporary file, and moves that file to the
    ``DEST_FILE`` path built from the ``version`` field of its manifest.

    The handled failure cases (no store link, no download link, failed
    download, unreadable manifest, version already archived) are reported
    with ``print`` and end the call. Errors raised by the initial page
    request are not caught here.

    Args:
        url: address of the page to scan.

    Returns:
        None in every case.
    """
    print(url)
    html = requests.get(url).text
    result = re.findall(r"https://chrome.google.com/webstore/detail/(.*)\?utm", html)
    if not result:
        print(bad('no extensions at this url'))
        return
    # Reuse the first match instead of scanning the page a second time.
    ext_id = result[0]
    print('ext_id', ext_id)

    soup = BeautifulSoup(html, "lxml")
    link = soup.find('a', {'rel': 'nofollow'})
    # find() yields None when the page has no such anchor, and the anchor may
    # lack an href; report both instead of crashing on attribute access.
    href = link.attrs.get('href') if link is not None else None
    if not href:
        print(bad('no download link at this url'))
        return
    print('storage url:', href)
    print(link.text, href)

    tmp_file = TMP_FILE.format(ext_id=ext_id)
    try:
        urllib.request.urlretrieve(href, tmp_file)
    except Exception as e:
        print(bad('fail to download crx:'), e)
        return

    manifest = None
    try:
        manifest = extract_manifest_of_file(tmp_file)
    except Exception as e:
        print(bad('bad download, parse of manifest failed'), e)

    if not manifest or 'version' not in manifest:
        # Unusable download: drop the temporary file. OSError already covers
        # FileNotFoundError, so one clause handles a missing file too.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        return

    version = manifest['version']
    print(good('manifest version:'), version)
    # The archive is named after the manifest's version, which can differ
    # from the version_name shown to users.
    target_dir_path = DEST_DIR.format(ext_id=ext_id)
    target_file_path = DEST_FILE.format(dir=target_dir_path, version=version)
    if os.path.isfile(target_file_path):
        print(bad("file is already here, here's the version_name"), manifest.get('version_name'), 'and .version=', manifest.get('version'))
        os.remove(tmp_file)
        return

    ok(good('version is added :D'), manifest['version'], url)
    os.makedirs(target_dir_path, exist_ok=True)
    shutil.move(tmp_file, target_file_path)
def do(url):
    """Look up the current version of one extension and archive its CRX.

    The last path segment of ``url`` is used as the extension id, so ``url``
    must be a string. Steps:

    1. Return early when ``SPECIFIC_EXT`` is set to a different id, or when
       ``is_404`` flags the latest stored record.
    2. Use the ``content`` of the latest stored record when its ``diff`` is
       more than -2 days; otherwise download and parse the page, and save the
       parsed infos when they carry a version and ``is_stored_recent`` is
       false.
    3. When no file exists yet for the listed version, download the CRX, read
       ``version`` from its manifest and move the file to the path built from
       that version.

    Handled failures are reported through ``ok``/``print`` and end the call.

    Args:
        url: store URL of the extension.

    Returns:
        None in every case.
    """
    ext_id = url.split('/')[-1]
    if SPECIFIC_EXT and ext_id != SPECIFIC_EXT:
        return
    print()
    print(ext_id)
    tmp_file = TMP_FILE.format(ext_id=ext_id)

    latest = latest_stored(ext_id)
    if latest and is_404(latest):
        return
    if latest:
        print(latest.get('name'))

    if latest and latest['diff'].days > -2 and 'content' in latest:
        print('using latest stored')
        infos = latest['content']
    else:
        # get current version
        try:
            req = requests.get(url)
            if req.status_code != 200:
                ok(bad('bad status code:'), req.status_code)
                return
            page_html = req.text
        except Exception as e:
            ok(bad('fail to download page: ' + url), e)
            return
        try:
            infos = parse_page(page_html)
        except Exception as e:
            ok(bad('ERROR: bad parsing for ' + url), e)
            infos = {}
        # NOTE(review): assumed to apply only to freshly fetched infos, so
        # cached content is never written back - confirm.
        if not is_stored_recent(ext_id) and 'version' in infos:
            store_infos_history(ext_id, infos)
            print('saved it :D')

    if 'version' not in infos:
        ok(bad('ERROR: no infos for ' + url))
        return
    current_version = infos['version']
    print('current_version:', current_version)
    target_dir_path = DEST_DIR.format(ext_id=ext_id)
    target_file_path = DEST_FILE.format(dir=target_dir_path, version=current_version)

    if os.path.isfile(target_file_path):
        print('latest version already downloaded')
        return

    # download extension
    # caveat: the listing can display the same version name for different
    # versions, so the manifest is checked again after the download.
    try:
        down(ext_id, tmp_file)
    except Exception as e:
        ok(bad('fail to download crx:'), e)
        return

    manifest = None
    try:
        manifest = extract_manifest_of_file(tmp_file)
    except Exception as e:
        ok(bad('bad download, parse of manifest failed'), e)

    if not manifest or 'version' not in manifest:
        # Unusable download: drop the temporary file. OSError already covers
        # FileNotFoundError, so one clause handles a missing file too.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        return

    version = manifest['version']
    print(good('manifest version:'), version)
    # current_version is the displayed version name; the archive is stored
    # under the manifest's version, which can differ from it.
    target_file_path = DEST_FILE.format(dir=target_dir_path, version=version)
    if os.path.isfile(target_file_path):
        print(bad("file is already here, here's the version_name"), manifest.get('version_name'), 'and .version=', manifest.get('version'))
        os.remove(tmp_file)
        return

    ok(good('version is added :D'), manifest['version'], url)
    os.makedirs(target_dir_path, exist_ok=True)
    shutil.move(tmp_file, target_file_path)
def do(url):
    """Look up the current version of one extension and archive its CRX.

    The last path segment of ``url`` is used as the extension id, so ``url``
    must be a string. Steps:

    1. Return early when ``SPECIFIC_EXT`` is set to a different id, or when
       ``is_404`` flags the latest stored record.
    2. Use the ``content`` of the latest stored record when its ``diff`` is
       more than -2 days; otherwise download and parse the page, and save the
       parsed infos when they carry a version and ``is_stored_recent`` is
       false.
    3. When no file exists yet for the listed version, download the CRX, read
       ``version`` from its manifest and move the file to the path built from
       that version.

    Handled failures are reported with ``print`` and end the call.

    Args:
        url: store URL of the extension.

    Returns:
        None in every case.
    """
    ext_id = url.split('/')[-1]
    if SPECIFIC_EXT and ext_id != SPECIFIC_EXT:
        return
    print()
    print(ext_id)
    tmp_file = TMP_FILE.format(ext_id=ext_id)

    latest = latest_stored(ext_id)
    if latest and is_404(latest):
        return
    if latest:
        print(latest.get('name'))

    if latest and latest['diff'].days > -2 and 'content' in latest:
        print('using latest stored')
        infos = latest['content']
    else:
        # get current version
        try:
            req = requests.get(url)
            if req.status_code != 200:
                print(bad('bad status code:'), req.status_code)
                return
            page_html = req.text
        except Exception as e:
            print(bad('fail to download page: '+url), e)
            return
        # A page that cannot be parsed is reported instead of aborting the
        # caller with whatever parse_page raised.
        try:
            infos = parse_page(page_html)
        except Exception as e:
            print(bad('ERROR: bad parsing for ' + url), e)
            infos = {}
        # Only infos that carry a version are worth keeping in the history.
        # NOTE(review): assumed to apply only to freshly fetched infos, so
        # cached content is never written back - confirm.
        if not is_stored_recent(ext_id) and 'version' in infos:
            store_infos_history(ext_id, infos)
            print('saved it :D')

    # Without a version there is no target file name to check or create.
    if 'version' not in infos:
        print(bad('ERROR: no infos for ' + url))
        return
    current_version = infos['version']
    print('current_version:', current_version)
    target_dir_path = DEST_DIR.format(ext_id=ext_id)
    target_file_path = DEST_FILE.format(dir=target_dir_path, version=current_version)

    if os.path.isfile(target_file_path):
        print('latest version already downloaded')
        return

    # download extension
    # caveat: the listing can display the same version name for different
    # versions, so the manifest is checked again after the download.
    try:
        down(ext_id, tmp_file)
    except Exception as e:
        print(bad('fail to download crx:'), e)
        return

    manifest = None
    try:
        manifest = extract_manifest_of_file(tmp_file)
    except Exception as e:
        print(bad('bad download, parse of manifest failed'), e)

    if not manifest or 'version' not in manifest:
        # Unusable download: drop the temporary file. OSError already covers
        # FileNotFoundError, so one clause handles a missing file too.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        return

    version = manifest['version']
    print(good('manifest version:'), version)
    # current_version is the displayed version name; the archive is stored
    # under the manifest's version, which can differ from it.
    target_file_path = DEST_FILE.format(dir=target_dir_path, version=version)
    if os.path.isfile(target_file_path):
        print(bad("file is already here, here's the version_name"), manifest.get('version_name'), 'and .version=', manifest.get('version'))
        os.remove(tmp_file)
        return

    ok(good('version is added :D'), manifest['version'], url)
    os.makedirs(target_dir_path, exist_ok=True)
    shutil.move(tmp_file, target_file_path)
# Copy the archives found under SOURCE_DIR/<ext_id>/ to
# DEST_DIR/<ext_id>/<version>.zip, where <version> is the "version" field of
# the manifest returned by extract_manifest_of_file. Destination files that
# already exist are left untouched.
import os
import shutil

from tqdm import tqdm

from extstats.parse_infos import extract_manifest_of_file

SOURCE_DIR = 'crawled/crx4chrome/'
DEST_DIR = 'crawled/crx/'

for ext_id in os.listdir(SOURCE_DIR):
    dir_fullpath = os.path.join(SOURCE_DIR, ext_id)
    if not os.path.isdir(dir_fullpath):
        # A stray file next to the per-extension folders cannot be listed.
        continue
    for version_file in os.listdir(dir_fullpath):
        version_path = os.path.join(dir_fullpath, version_file)
        data = extract_manifest_of_file(version_path)
        if not data or 'version' not in data:
            # No manifest, or a manifest without a version: there is no name
            # to store the file under, so skip it rather than abort the run.
            continue
        version = data['version']
        dest_dir = os.path.join(DEST_DIR, ext_id)
        destpath = os.path.join(dest_dir, version + '.zip')
        if not os.path.exists(destpath):
            print('adding', ext_id, version)
            os.makedirs(dest_dir, exist_ok=True)
            shutil.copy(version_path, destpath)
# Copy every file in SOURCE_DIR to DEST_DIR/<ext_id>/<version>.zip, where
# <ext_id> is the file name without ".crx" and <version> is the "version"
# field of the manifest returned by extract_manifest_of_file. Destination
# files that already exist are left untouched.
import os
import shutil

from tqdm import tqdm

from extstats.parse_infos import extract_manifest_of_file

SOURCE_DIR = '/media/rob/backup/all_crx/crx/'
DEST_DIR = '/media/rob/backup/exts/'

for file in tqdm(os.listdir(SOURCE_DIR)):
    ext_id = file.replace('.crx', '')
    fullpath = os.path.join(SOURCE_DIR, file)
    data = extract_manifest_of_file(fullpath)
    if not data or 'version' not in data:
        # No manifest, or a manifest without a version: there is no name to
        # store the file under, so skip it rather than abort the run.
        continue
    version = data['version']
    dest_dir = os.path.join(DEST_DIR, ext_id)
    destpath = os.path.join(dest_dir, version + '.zip')
    # A directory sitting where the archive belongs is removed so the copy
    # below can create a regular file in its place.
    if os.path.isdir(destpath):
        shutil.rmtree(destpath)
    if not os.path.exists(destpath):
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(fullpath, destpath)
# Copy the archives found under SOURCE_DIR/<ext_id>/ to
# DEST_DIR/<ext_id>/<version>.zip, where <version> is the "version" field of
# the manifest returned by extract_manifest_of_file. Destination files that
# already exist are left untouched.
import os
import shutil

from tqdm import tqdm

from extstats.parse_infos import extract_manifest_of_file

SOURCE_DIR = 'crawled/crx4chrome/'
DEST_DIR = 'crawled/crx/'

for ext_id in os.listdir(SOURCE_DIR):
    dir_fullpath = os.path.join(SOURCE_DIR, ext_id)
    if not os.path.isdir(dir_fullpath):
        # A stray file next to the per-extension folders cannot be listed.
        continue
    for version_file in os.listdir(dir_fullpath):
        version_path = os.path.join(dir_fullpath, version_file)
        data = extract_manifest_of_file(version_path)
        if not data or 'version' not in data:
            # No manifest, or a manifest without a version: there is no name
            # to store the file under, so skip it rather than abort the run.
            continue
        version = data['version']
        dest_dir = os.path.join(DEST_DIR, ext_id)
        destpath = os.path.join(dest_dir, version + '.zip')
        if not os.path.exists(destpath):
            print('adding', ext_id, version)
            os.makedirs(dest_dir, exist_ok=True)
            shutil.copy(version_path, destpath)
# Copy every file in SOURCE_DIR to DEST_DIR/<ext_id>/<version>.zip, where
# <ext_id> is the file name without ".crx" and <version> is the "version"
# field of the manifest returned by extract_manifest_of_file. Destination
# files that already exist are left untouched.
import os
import shutil

from tqdm import tqdm

from extstats.parse_infos import extract_manifest_of_file

SOURCE_DIR = '/media/rob/backup/all_crx/crx/'
DEST_DIR = '/media/rob/backup/exts/'

for file in tqdm(os.listdir(SOURCE_DIR)):
    ext_id = file.replace('.crx', '')
    fullpath = os.path.join(SOURCE_DIR, file)
    data = extract_manifest_of_file(fullpath)
    if not data or 'version' not in data:
        # No manifest, or a manifest without a version: there is no name to
        # store the file under, so skip it rather than abort the run.
        continue
    version = data['version']
    dest_dir = os.path.join(DEST_DIR, ext_id)
    destpath = os.path.join(dest_dir, version + '.zip')
    # A directory sitting where the archive belongs is removed so the copy
    # below can create a regular file in its place.
    if os.path.isdir(destpath):
        shutil.rmtree(destpath)
    if not os.path.exists(destpath):
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(fullpath, destpath)