def try_down_version(url):
    """Download the package linked from a page and archive it by version.

    The page at ``url`` is fetched and the extension id is taken from the
    first ``https://chrome.google.com/webstore/detail/<id>?utm`` link found
    in it. The file behind the first ``<a rel="nofollow">`` link is then
    downloaded to ``TMP_FILE``. When its manifest declares a ``version`` and
    no file exists yet at the matching ``DEST_FILE`` path, the download is
    moved there; otherwise the temporary file is removed.

    Args:
        url: Address of the page to scrape.

    Returns:
        None. Progress and failures are reported through ``print``/``ok``.
    """
    print(url)
    html = requests.get(url).text

    # Scan the page once; only the first web-store link identifies the
    # extension.
    match = re.search(r"https://chrome.google.com/webstore/detail/(.*)\?utm",
                      html)
    if match is None:
        print(bad('no extensions at this url'))
        return
    ext_id = match.group(1)
    print('ext_id', ext_id)

    soup = BeautifulSoup(html, "lxml")
    link = soup.find('a', {'rel': 'nofollow'})
    # find() yields None when the page has no such anchor, and an anchor may
    # lack an href; both cases mean there is nothing to download.
    href = link.get('href') if link is not None else None
    if not href:
        print(bad('no download link at this url'))
        return
    print('storage url:', href)
    print(link.text, href)

    tmp_file = TMP_FILE.format(ext_id=ext_id)

    try:
        urllib.request.urlretrieve(href, tmp_file)
    except Exception as e:
        print(bad('fail to download crx:'), e)
        return

    manifest = None
    try:
        manifest = extract_manifest_of_file(tmp_file)
    except Exception as e:
        print(bad('bad download, parse of manifest failed'), e)

    if not manifest or 'version' not in manifest:
        # Unusable download: best-effort cleanup. OSError already covers
        # FileNotFoundError, so a single handler is enough.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        return

    version = manifest['version']
    print(good('manifest version:'), version)
    # The file is stored under the manifest's version, which can differ from
    # the version_name.
    target_dir_path = DEST_DIR.format(ext_id=ext_id)
    target_file_path = DEST_FILE.format(dir=target_dir_path,
                                        version=version)
    if os.path.isfile(target_file_path):
        print(bad("file is already here, here's the version_name"),
              manifest.get('version_name'), 'and .version=',
              manifest.get('version'))
        os.remove(tmp_file)
        return

    ok(good('version is added :D'), version, url)
    os.makedirs(target_dir_path, exist_ok=True)
    shutil.move(tmp_file, target_file_path)
def try_down_version(url):
    """Scrape one page for an extension package and store it by manifest version.

    Steps, in order:

    1. Fetch ``url`` and read the extension id out of the first
       ``https://chrome.google.com/webstore/detail/<id>?utm`` link.
    2. Download the target of the first ``<a rel="nofollow">`` link into
       ``TMP_FILE``.
    3. Read the manifest of the download. If it declares a ``version`` and
       the matching ``DEST_FILE`` does not exist yet, move the download
       there. In every other case the temporary file is deleted.

    Args:
        url: Address of the page to scrape.

    Returns:
        None. Progress and failures are reported through ``print``/``ok``.
    """
    print(url)
    html = requests.get(url).text

    # A single search replaces the two identical findall passes.
    found = re.search(r"https://chrome.google.com/webstore/detail/(.*)\?utm", html)
    if found is None:
        print(bad('no extensions at this url'))
        return
    ext_id = found.group(1)
    print('ext_id', ext_id)

    soup = BeautifulSoup(html, "lxml")
    link = soup.find('a', {'rel': 'nofollow'})
    # Guard against pages without a nofollow anchor (find() returns None) and
    # anchors without an href attribute.
    href = link.get('href') if link is not None else None
    if not href:
        print(bad('no download link at this url'))
        return
    print('storage url:', href)
    print(link.text, href)

    tmp_file = TMP_FILE.format(ext_id=ext_id)

    try:
        urllib.request.urlretrieve(href, tmp_file)
    except Exception as e:
        print(bad('fail to download crx:'), e)
        return

    manifest = None
    try:
        manifest = extract_manifest_of_file(tmp_file)
    except Exception as e:
        print(bad('bad download, parse of manifest failed'), e)

    if not manifest or 'version' not in manifest:
        # Nothing usable was downloaded; removing the temp file is best
        # effort. FileNotFoundError is a subclass of OSError.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        return

    version = manifest['version']
    print(good('manifest version:'), version)
    # Stored under the manifest version, which can differ from version_name.
    target_dir_path = DEST_DIR.format(ext_id=ext_id)
    target_file_path = DEST_FILE.format(dir=target_dir_path, version=version)
    if os.path.isfile(target_file_path):
        print(bad("file is already here, here's the version_name"),
            manifest.get('version_name'),
            'and .version=', manifest.get('version'))
        os.remove(tmp_file)
        return

    ok(good('version is added :D'), version, url)
    os.makedirs(target_dir_path, exist_ok=True)
    shutil.move(tmp_file, target_file_path)
def do(url):
    """Refresh the stored information for one extension and archive its package.

    The extension id is the last ``/``-separated segment of ``url``. The
    function returns early when ``SPECIFIC_EXT`` is set to a different id or
    when the latest stored record is flagged by ``is_404``.

    Extension info comes from the latest stored record when
    ``latest['diff'].days > -2`` and it has a ``content`` entry; otherwise the
    page at ``url`` is downloaded and parsed, and the result is saved with
    ``store_infos_history`` unless a recent record already exists.

    If no file exists at ``DEST_FILE`` for the reported version, the package
    is downloaded with ``down`` and stored under the version declared in its
    manifest (unless that file already exists).

    Args:
        url: Address of the extension page; must be a string.

    Returns:
        None. Progress and failures are reported through ``print``/``ok``.
    """
    ext_id = url.split('/')[-1]
    if SPECIFIC_EXT and ext_id != SPECIFIC_EXT:
        return
    print()
    print(ext_id)
    tmp_file = TMP_FILE.format(ext_id=ext_id)

    latest = latest_stored(ext_id)
    if latest and is_404(latest):
        return
    if latest:
        print(latest.get('name'))
    if latest and latest['diff'].days > -2 and 'content' in latest:
        print('using latest stored')
        infos = latest['content']
    else:
        # Get the current version from the live page. The former
        # "url is None" fallback was unreachable: url.split() above already
        # requires a string.
        try:
            req = requests.get(url)
            if req.status_code != 200:
                ok(bad('bad status code:'), req.status_code)
                return
            page_html = req.text
        except Exception as e:
            ok(bad('fail to download page: ' + url), e)
            return
        try:
            infos = parse_page(page_html)
        except Exception as e:
            ok(bad('ERROR: bad parsing for ' + url), e)
            infos = {}
        # Only keep history entries that actually carry a version.
        if not is_stored_recent(ext_id) and 'version' in infos:
            store_infos_history(ext_id, infos)
            print('saved it :D')
    if 'version' not in infos:
        ok(bad('ERROR: no infos for ' + url))
        return
    current_version = infos['version']
    print('current_version:', current_version)

    target_dir_path = DEST_DIR.format(ext_id=ext_id)
    target_file_path = DEST_FILE.format(dir=target_dir_path,
                                        version=current_version)

    # Caveat: the same displayed version name can hide different manifest
    # versions, so this check can skip a genuinely new package.
    if os.path.isfile(target_file_path):
        print('latest version already downloaded')
        return

    # download extension
    try:
        down(ext_id, tmp_file)
    except Exception as e:
        ok(bad('fail to download crx:'), e)
        return

    manifest = None
    try:
        manifest = extract_manifest_of_file(tmp_file)
    except Exception as e:
        ok(bad('bad download, parse of manifest failed'), e)

    if not manifest or 'version' not in manifest:
        # Unusable download: best-effort cleanup. OSError already covers
        # FileNotFoundError, so a single handler is enough.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        return

    version = manifest['version']
    print(good('manifest version:'), version)
    # current_version represents version_name, so it can differ from the
    # manifest version the file is stored under.
    target_file_path = DEST_FILE.format(dir=target_dir_path,
                                        version=version)
    if os.path.isfile(target_file_path):
        print(bad("file is already here, here's the version_name"),
              manifest.get('version_name'), 'and .version=',
              manifest.get('version'))
        os.remove(tmp_file)
        return

    ok(good('version is added :D'), version, url)
    os.makedirs(target_dir_path, exist_ok=True)
    shutil.move(tmp_file, target_file_path)
def do(url):
    """Update the stored info for one extension and archive its current package.

    The extension id is the last ``/``-separated segment of ``url``. Nothing
    is done when ``SPECIFIC_EXT`` names a different id or when the latest
    stored record is flagged by ``is_404``.

    The info dict is taken from the latest stored record when
    ``latest['diff'].days > -2`` and it has a ``content`` entry. Otherwise the
    page at ``url`` is fetched and parsed; a parse failure or a result without
    a ``version`` is reported and ends the call instead of raising. Parsed
    info with a version is saved via ``store_infos_history`` unless a recent
    record exists.

    When no file exists at ``DEST_FILE`` for the reported version, the
    package is fetched with ``down`` and stored under the version found in its
    manifest (unless that file already exists).

    Args:
        url: Address of the extension page; must be a string.

    Returns:
        None. Progress and failures are reported through ``print``/``ok``.
    """
    ext_id = url.split('/')[-1]
    if SPECIFIC_EXT and ext_id != SPECIFIC_EXT:
        return
    print()
    print(ext_id)
    tmp_file = TMP_FILE.format(ext_id=ext_id)

    latest = latest_stored(ext_id)
    if latest and is_404(latest):
        return
    if latest:
        print(latest.get('name'))
    if latest and latest['diff'].days > -2 and 'content' in latest:
        print('using latest stored')
        infos = latest['content']
    else:
        # Get the current version from the live page. A "url is None"
        # fallback used to live here but could never run: url.split() above
        # already requires a string.
        try:
            req = requests.get(url)
            if req.status_code != 200:
                print(bad('bad status code:'), req.status_code)
                return
            page_html = req.text
        except Exception as e:
            print(bad('fail to download page: '+url), e)
            return
        # A page that cannot be parsed must not abort the caller's crawl.
        try:
            infos = parse_page(page_html)
        except Exception as e:
            print(bad('ERROR: bad parsing for ' + url), e)
            infos = {}
        # Only keep history entries that actually carry a version.
        if not is_stored_recent(ext_id) and 'version' in infos:
            store_infos_history(ext_id, infos)
            print('saved it :D')
    if 'version' not in infos:
        print(bad('ERROR: no infos for ' + url))
        return
    current_version = infos['version']
    print('current_version:', current_version)

    target_dir_path = DEST_DIR.format(ext_id=ext_id)
    target_file_path = DEST_FILE.format(dir=target_dir_path, version=current_version)

    # Caveat: the same displayed version name can hide different manifest
    # versions, so this check can skip a genuinely new package.
    if os.path.isfile(target_file_path):
        print('latest version already downloaded')
        return

    # download extension
    try:
        down(ext_id, tmp_file)
    except Exception as e:
        print(bad('fail to download crx:'), e)
        return

    manifest = None
    try:
        manifest = extract_manifest_of_file(tmp_file)
    except Exception as e:
        print(bad('bad download, parse of manifest failed'), e)

    if not manifest or 'version' not in manifest:
        # Nothing usable was downloaded; removing the temp file is best
        # effort. FileNotFoundError is a subclass of OSError.
        try:
            os.remove(tmp_file)
        except OSError:
            pass
        return

    version = manifest['version']
    print(good('manifest version:'), version)
    # current_version represents version_name, so it can differ from the
    # manifest version the file is stored under.
    target_file_path = DEST_FILE.format(dir=target_dir_path, version=version)
    if os.path.isfile(target_file_path):
        print(bad("file is already here, here's the version_name"),
            manifest.get('version_name'),
            'and .version=', manifest.get('version'))
        os.remove(tmp_file)
        return

    ok(good('version is added :D'), version, url)
    os.makedirs(target_dir_path, exist_ok=True)
    shutil.move(tmp_file, target_file_path)
import os, shutil
from tqdm import tqdm

from extstats.parse_infos import extract_manifest_of_file

# Directory holding one sub-directory per extension id, each containing the
# package files to import.
SOURCE_DIR = 'crawled/crx4chrome/'
# Archive root; packages end up at <DEST_DIR>/<ext_id>/<version>.zip.
DEST_DIR = 'crawled/crx/'

# Copy every package whose manifest declares a version into the archive,
# skipping versions that are already present.
for ext_id in os.listdir(SOURCE_DIR):
    dir_fullpath = os.path.join(SOURCE_DIR, ext_id)
    for version_file in os.listdir(dir_fullpath):
        version_path = os.path.join(dir_fullpath, version_file)
        try:
            data = extract_manifest_of_file(version_path)
        except Exception as e:
            # One unreadable package must not abort the whole import; a
            # re-run would otherwise stop at the same file every time.
            print('skipping', version_path, e)
            continue
        # A manifest without a version cannot be filed anywhere.
        if not data or 'version' not in data:
            continue
        version = data['version']
        dest_dir = os.path.join(DEST_DIR, ext_id)
        destpath = os.path.join(dest_dir, version + '.zip')
        if not os.path.exists(destpath):
            print('adding', ext_id, version)
            os.makedirs(dest_dir, exist_ok=True)
            shutil.copy(version_path, destpath)
Exemplo n.º 6
0
import os, shutil
from tqdm import tqdm

from extstats.parse_infos import extract_manifest_of_file

# Flat directory of package files; each file name, with '.crx' removed, is
# used as the extension id.
SOURCE_DIR = '/media/rob/backup/all_crx/crx/'
# Archive root; packages end up at <DEST_DIR>/<ext_id>/<version>.zip.
DEST_DIR = '/media/rob/backup/exts/'

# Copy every package whose manifest declares a version into the archive,
# skipping versions that are already present.
for file in tqdm(os.listdir(SOURCE_DIR)):
    ext_id = file.replace('.crx', '')
    fullpath = os.path.join(SOURCE_DIR, file)
    try:
        data = extract_manifest_of_file(fullpath)
    except Exception as e:
        # One unreadable package must not abort the whole import; a re-run
        # would otherwise stop at the same file every time.
        print('skipping', fullpath, e)
        continue
    # A manifest without a version cannot be filed anywhere.
    if not data or 'version' not in data:
        continue
    version = data['version']
    dest_dir = os.path.join(DEST_DIR, ext_id)
    destpath = os.path.join(dest_dir, version + '.zip')
    # A directory occupying the archive's path is removed so the file can
    # take its place (isdir() is already False for missing paths).
    if os.path.isdir(destpath):
        shutil.rmtree(destpath)
    if not os.path.exists(destpath):
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(fullpath, destpath)
import os, shutil
from tqdm import tqdm

from extstats.parse_infos import extract_manifest_of_file

# Input tree: SOURCE_DIR/<ext_id>/<package file>.
SOURCE_DIR = 'crawled/crx4chrome/'
# Output tree: DEST_DIR/<ext_id>/<manifest version>.zip.
DEST_DIR = 'crawled/crx/'

# Walk every extension directory and file each package under the version
# named in its manifest; existing archive entries are left untouched.
for ext_id in os.listdir(SOURCE_DIR):
    dir_fullpath = os.path.join(SOURCE_DIR, ext_id)
    for version_file in os.listdir(dir_fullpath):
        version_path = os.path.join(dir_fullpath, version_file)
        try:
            data = extract_manifest_of_file(version_path)
        except Exception as e:
            # Report and move on: a single broken package should not stop
            # the remaining ones from being imported.
            print('skipping', version_path, e)
            continue
        # Without a version there is no destination file name.
        if not data or 'version' not in data:
            continue
        version = data['version']
        dest_dir = os.path.join(DEST_DIR, ext_id)
        destpath = os.path.join(dest_dir, version + '.zip')
        if not os.path.exists(destpath):
            print('adding', ext_id, version)
            os.makedirs(dest_dir, exist_ok=True)
            shutil.copy(version_path, destpath)
import os, shutil
from tqdm import tqdm

from extstats.parse_infos import extract_manifest_of_file

# Input: flat directory of package files named after their extension id
# (the '.crx' part of the name is dropped to obtain the id).
SOURCE_DIR = '/media/rob/backup/all_crx/crx/'
# Output tree: DEST_DIR/<ext_id>/<manifest version>.zip.
DEST_DIR = '/media/rob/backup/exts/'

# File each package under the version named in its manifest; existing
# archive files are left untouched.
for file in tqdm(os.listdir(SOURCE_DIR)):
    ext_id = file.replace('.crx', '')
    fullpath = os.path.join(SOURCE_DIR, file)
    try:
        data = extract_manifest_of_file(fullpath)
    except Exception as e:
        # Report and move on: a single broken package should not stop the
        # remaining ones from being imported.
        print('skipping', fullpath, e)
        continue
    # Without a version there is no destination file name.
    if not data or 'version' not in data:
        continue
    version = data['version']
    dest_dir = os.path.join(DEST_DIR, ext_id)
    destpath = os.path.join(dest_dir, version + '.zip')
    # If a directory sits where the archive file belongs, delete it first
    # (isdir() is already False for missing paths).
    if os.path.isdir(destpath):
        shutil.rmtree(destpath)
    if not os.path.exists(destpath):
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(fullpath, destpath)