Exemplo n.º 1
0
def update():
    """Check for a newer PubMed OA bulk archive and (re)download it.

    Compares the date embedded in the saved parquet folder name against the
    upstream update date.  On the first run, or when upstream is newer, stale
    outputs are removed and the tarball is downloaded and unpacked.

    Relies on module-level globals: ``save_dir``, ``download_dir``,
    ``unzip_dir`` and the helper ``get_update_date``.

    Returns
    -------
    tuple of (bool, datetime)
        ``is_update`` — whether a (re)download happened;
        ``date_update`` — the upstream update date.
    """
    save_file = os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet')
    file_list = list(filter(os.path.isdir, glob(save_file)))
    if file_list:
        d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0)
        date_file = datetime.strptime(d, '%Y_%m_%d')
        date_update = get_update_date(option='oa')
        # if upstream update is newer than the saved file's embedded date
        is_update = date_update > date_file
        if is_update:
            print("MEDLINE update available!")
            # BUG FIX: subprocess.call without shell=True does not expand '*',
            # so 'rm -rf <pattern>' removed nothing; expand the glob in Python.
            stale = glob(os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet'))
            if stale:
                subprocess.call(['rm', '-rf'] + stale)
            # BUG FIX: 'pubmed_oa' was passed as a separate relative path;
            # join it onto download_dir so the intended folder is removed.
            subprocess.call(['rm', '-rf', os.path.join(download_dir, 'pubmed_oa')])
            subprocess.call(['wget', 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz', '--directory', download_dir])
            if not os.path.isdir(unzip_dir):
                os.mkdir(unzip_dir)
            subprocess.call(['tar', '-xzf', os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'), '--directory', unzip_dir])
        else:
            print("No update available")
    else:
        print("Download Pubmed Open-Access for the first time")
        is_update = True
        date_update = get_update_date(option='oa')
        subprocess.call(['wget', 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz', '--directory', download_dir])
        if not os.path.isdir(unzip_dir):
            os.mkdir(unzip_dir)
        subprocess.call(['tar', '-xzf', os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'), '--directory', unzip_dir])
    return is_update, date_update
Exemplo n.º 2
0
def update():
    """Check for a newer MEDLINE baseline and (re)download the sample files.

    Compares the date embedded in the saved parquet folder name against the
    upstream update date.  On the first run, or when upstream is newer, stale
    outputs are removed and three example baseline files are downloaded.

    Relies on module-level globals: ``save_dir``, ``download_dir`` and the
    helper ``get_update_date``.

    Returns
    -------
    tuple of (bool, datetime)
        ``is_update`` — whether a (re)download happened;
        ``date_update`` — the upstream update date.
    """
    # only example for 3 files, change to
    # ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/*.xml.gz to download all
    sample_files = ('medline16n0001.xml.gz',
                    'medline16n0166.xml.gz',
                    'medline16n0718.xml.gz')

    save_file = os.path.join(save_dir, 'medline*_*_*_*.parquet')
    file_list = list(filter(os.path.isdir, glob(save_file)))
    if file_list:
        d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0)
        date_file = datetime.strptime(d, '%Y_%m_%d')
        date_update = get_update_date(option='medline')
        # if upstream update is newer than the saved file's embedded date
        is_update = date_update > date_file
        if is_update:
            print("MEDLINE update available!")
            # BUG FIX: subprocess.call without shell=True does not expand '*',
            # so 'rm -rf <pattern>' removed nothing; expand the glob in Python.
            stale = glob(os.path.join(save_dir, 'medline_*.parquet'))
            if stale:
                subprocess.call(['rm', '-rf'] + stale)
            subprocess.call(['rm', '-rf', download_dir])
            for fname in sample_files:
                subprocess.call(['wget',
                                 'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/' + fname,
                                 '--directory', download_dir])
        else:
            print("No update available")
    else:
        print("Download MEDLINE for the first time")
        is_update = True
        date_update = get_update_date(option='medline')
        for fname in sample_files:
            subprocess.call(['wget',
                             'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/' + fname,
                             '--directory', download_dir])
    return is_update, date_update
Exemplo n.º 3
0
def update():
    """Check for a newer MEDLINE dataset and (re)download the XML dumps.

    Compares the date embedded in the saved CSV folder name against the
    upstream update date.  On the first run the full baseline is fetched;
    when upstream is newer, stale outputs are removed and the update files
    are fetched.

    Relies on module-level globals: ``save_dir``, ``download_dir`` and the
    helper ``get_update_date``.

    Returns
    -------
    tuple of (bool, datetime)
        ``is_update`` — whether a (re)download happened;
        ``date_update`` — the upstream update date.
    """
    save_file = os.path.join(save_dir, 'medline*_*_*_*.csv')
    file_list = list(filter(os.path.isdir, glob(save_file)))
    if file_list:
        d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0)
        date_file = datetime.strptime(d, '%Y_%m_%d')
        date_update = get_update_date(option='medline')
        # if upstream update is newer than the saved file's embedded date
        is_update = date_update > date_file
        if is_update:
            print("MEDLINE update available!")
            # BUG FIX: subprocess.call without shell=True does not expand '*',
            # so 'rm -rf <pattern>' removed nothing; expand the glob in Python.
            stale = glob(os.path.join(save_dir, 'medline_*.csv'))
            if stale:
                subprocess.call(['rm', '-rf'] + stale)
            subprocess.call(['rm', '-rf', download_dir])
            # NOTE: the '*' here is fine — wget performs its own globbing
            # for FTP URLs, no shell expansion needed.
            subprocess.call([
                'wget',
                'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/*.xml.gz',
                '--directory', download_dir
            ])
        else:
            print("No update available")
    else:
        print("Download MEDLINE for the first time")
        is_update = True
        date_update = get_update_date(option='medline')
        subprocess.call([
            'wget', 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/*.xml.gz',
            '--directory', download_dir
        ])
    return is_update, date_update
Exemplo n.º 4
0
def update():
    """Check for a newer MEDLINE baseline and (re)download the sample files.

    Compares the date embedded in the saved parquet folder name against the
    upstream update date.  On the first run, or when upstream is newer, stale
    outputs are removed and three example baseline files are downloaded.

    Relies on module-level globals: ``save_dir``, ``download_dir`` and the
    helper ``get_update_date``.

    Returns
    -------
    tuple of (bool, datetime)
        ``is_update`` — whether a (re)download happened;
        ``date_update`` — the upstream update date.
    """
    # only example for 3 files, change to
    # ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/*.xml.gz to download all
    sample_files = ('medline16n0001.xml.gz',
                    'medline16n0166.xml.gz',
                    'medline16n0718.xml.gz')

    save_file = os.path.join(save_dir, 'medline*_*_*_*.parquet')
    file_list = list(filter(os.path.isdir, glob(save_file)))
    if file_list:
        d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0)
        date_file = datetime.strptime(d, '%Y_%m_%d')
        date_update = get_update_date(option='medline')
        # if upstream update is newer than the saved file's embedded date
        is_update = date_update > date_file
        if is_update:
            print("MEDLINE update available!")
            # BUG FIX: subprocess.call without shell=True does not expand '*',
            # so 'rm -rf <pattern>' removed nothing; expand the glob in Python.
            stale = glob(os.path.join(save_dir, 'medline_*.parquet'))
            if stale:
                subprocess.call(['rm', '-rf'] + stale)
            subprocess.call(['rm', '-rf', download_dir])
            for fname in sample_files:
                subprocess.call([
                    'wget',
                    'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/' + fname,
                    '--directory', download_dir
                ])
        else:
            print("No update available")
    else:
        print("Download MEDLINE for the first time")
        is_update = True
        date_update = get_update_date(option='medline')
        for fname in sample_files:
            subprocess.call([
                'wget',
                'ftp://ftp.nlm.nih.gov/nlmdata/.medleasebaseline/gz/' + fname,
                '--directory', download_dir
            ])
    return is_update, date_update
Exemplo n.º 5
0
def update():
    """Check for a newer PubMed OA bulk archive and (re)download it.

    Compares the date embedded in the saved parquet folder name against the
    upstream update date.  On the first run, or when upstream is newer, stale
    outputs are removed and the tarball is downloaded and unpacked.

    Relies on module-level globals: ``save_dir``, ``download_dir``,
    ``unzip_dir`` and the helper ``get_update_date``.

    Returns
    -------
    tuple of (bool, datetime)
        ``is_update`` — whether a (re)download happened;
        ``date_update`` — the upstream update date.
    """
    save_file = os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet')
    file_list = list(filter(os.path.isdir, glob(save_file)))
    if file_list:
        d = re.search('[0-9]+_[0-9]+_[0-9]+', file_list[0]).group(0)
        date_file = datetime.strptime(d, '%Y_%m_%d')
        date_update = get_update_date(option='oa')
        # if upstream update is newer than the saved file's embedded date
        is_update = date_update > date_file
        if is_update:
            print("MEDLINE update available!")
            # BUG FIX: subprocess.call without shell=True does not expand '*',
            # so 'rm -rf <pattern>' removed nothing; expand the glob in Python.
            stale = glob(os.path.join(save_dir, 'pubmed_oa_*_*_*.parquet'))
            if stale:
                subprocess.call(['rm', '-rf'] + stale)
            # BUG FIX: 'pubmed_oa' was passed as a separate relative path;
            # join it onto download_dir so the intended folder is removed.
            subprocess.call(['rm', '-rf', os.path.join(download_dir, 'pubmed_oa')])
            subprocess.call([
                'wget',
                'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz',
                '--directory', download_dir
            ])
            if not os.path.isdir(unzip_dir):
                os.mkdir(unzip_dir)
            subprocess.call([
                'tar', '-xzf',
                os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'),
                '--directory', unzip_dir
            ])
        else:
            print("No update available")
    else:
        print("Download Pubmed Open-Access for the first time")
        is_update = True
        date_update = get_update_date(option='oa')
        subprocess.call([
            'wget',
            'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/non_comm_use.A-B.xml.tar.gz',
            '--directory', download_dir
        ])
        if not os.path.isdir(unzip_dir):
            os.mkdir(unzip_dir)
        subprocess.call([
            'tar', '-xzf',
            os.path.join(download_dir, 'non_comm_use.A-B.xml.tar.gz'),
            '--directory', unzip_dir
        ])
    return is_update, date_update
def main(_args):
    '''
    Download colocation data.

    Parameters
    ----------
    _args : list
        Arg list: secret_key, username and pass dir, csv file specifying
        download countries and ids, outdir.

    Returns
    -------
    None.

    '''
    import getpass  # local import: needed only for interactive credential entry

    # NOTE(review): this credential block was reconstructed — the original
    # source line was corrupted/censored ('"Username: "******"Update...').
    # `keys` is consumed by download_data() below; verify the expected shape
    # ([username, password]) against that helper.
    username = input("Username: ")
    password = getpass.getpass("Password: ")
    keys = [username, password]

    update = input("Update datasets? (y/n): ")
    if update == 'y':
        update = True
    elif update == 'n':
        update = False
    else:
        sys.exit('Unknown update input. Choose "y", "n". Exiting.')

    # read target datasets
    data_target = pd.read_csv(_args[1])

    for i, dataset_id in enumerate(data_target['id']):
        country_output = (_args[len(_args) - 1] + "/"
                          + data_target.loc[i, 'country'] + '_mobility')

        base_url = ('https://www.facebook.com/geoinsights-portal/downloads/vector/?id='
                    + str(dataset_id) + '&ds=')

        earliest_date = datetime(int(data_target.loc[i, 'year']),
                                 int(data_target.loc[i, 'month']),
                                 int(data_target.loc[i, 'day']),
                                 int(data_target.loc[i, 'hour']))

        data_dates = get_file_dates(earliest_date)

        if update:
            # keep only dates newer than the last file already saved
            data_dates = list(
                compress(data_dates,
                         [x > get_update_date(country_output)
                          for x in data_dates]))

        if len(data_dates) == 0:
            sys.exit('No datasets to download. Exiting.')

        urls = get_urls(base_url, data_dates)

        download_data(urls, keys)

        move_most_recent_files(country_output, urls)

    print('Success.')
Exemplo n.º 7
0
    def test_get_update_date(self):
        """A directory without dated files raises ValueError; once a dated
        CSV exists, get_update_date returns a datetime."""
        self.data = pd.DataFrame({'data': [1, 2, 3, 4, 5]})

        # No dated file present yet -> lookup must fail.
        with self.assertRaises(ValueError):
            get_update_date('./tmp1')

        self.data.to_csv('./tmp1/test_2020_01_01.csv')

        parsed = get_update_date('./tmp1')
        self.assertIsInstance(parsed, datetime)
def pull_population(outdir, keys, country, dl_variables, update,
                    population_type):
    '''
    Download population rasters for one country from the Facebook
    geoinsights portal.

    Parameters
    ----------
    outdir : str
        Output directory.
    keys : list
        User credentials [username, password].
    country : str
        Country name - must match .config file exactly (names with spaces
        must replace ' ' with '_').
    dl_variables : dict
        Download-specific variables: 'id' = dataset id, 'origin' = dataset
        origin datetime.datetime object.
    update : boolean
        Whether an existing dataset is being updated.
    population_type : str
        Population dataset flavour; becomes the suffix of the output folder.

    Returns
    -------
    None.

    '''
    country_output = outdir + "/" + country + '_' + population_type

    base_url = ('https://www.facebook.com/geoinsights-portal/downloads/raster/?id='
                + str(dl_variables['id']) + '&ds=')

    data_dates = get_file_dates(dl_variables['origin'])

    if update:
        # keep only dates newer than the most recent file already saved
        data_dates = [d for d in data_dates
                      if d > get_update_date(country_output)]

    if not data_dates:
        sys.exit('No datasets to download. Exiting.')

    urls = get_urls(base_url, data_dates)

    start_time = download_data(urls, keys)

    move_most_recent_files(country_output, urls, start_time)

    remove_empty_files(country_output)

    print('Success.')