Exemplo n.º 1
0
    def run(self):
        old_ids = export_doc_ids(server=SERVER,
                                 src_index=OLD_INDEX,
                                 src_type=OLD_TYPE)

        new_ids = export_doc_ids(server=SERVER,
                                 src_index=NEW_INDEX,
                                 src_type=NEW_TYPE)

        for _id in old_ids:
            if _id not in new_ids:
                self.missing_ids[_id] = 0
                if len(self.missing_ids) % 1000 == 0:
                    print 'Missing ids', len(self.missing_ids)

        for _id in new_ids:
            if _id not in old_ids:
                self.new_ids[_id] = 0
                if len(self.new_ids) % 1000 == 0:
                    print 'New ids', len(self.new_ids)

        print 'Missing ids', len(self.missing_ids)
        print 'New ids', len(self.new_ids)

        file_utils.make_directory(missing_ids_directory)

        file_utils.save_file(missing_ids_directory, 'missing_ids.json',
                             self.missing_ids.keys())
        file_utils.save_file(missing_ids_directory, 'new_ids.json',
                             self.new_ids)
Exemplo n.º 2
0
    def get_update_records_directory(self, sub_directory=None):
        """Build the 'update_records' directory path and return it.

        The path is rooted at the load config's ``other_files_directory()``.
        When ``sub_directory`` is given it is appended as one more path
        component. The resulting path is passed to
        ``file_utils.make_directory`` before being returned.
        """
        base_directory = self.load_manager.get_load_config().other_files_directory()

        # Optional trailing component; empty when no sub directory was requested.
        suffix = '' if sub_directory is None else '/' + sub_directory
        records_directory = base_directory + '/' + 'update_records' + suffix

        file_utils.make_directory(records_directory)
        return records_directory
Exemplo n.º 3
0
    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type, username, password):
        """Set up loader helpers for a source and a destination index.

        One ``DataLoaderUtils`` is created for the source server/index/type
        and one for the destination. The credentials are stored unchanged on
        the instance, and TEMP_DIR is passed to ``file_utils.make_directory``.
        """
        # Plain attribute storage; no validation is performed here.
        self.username, self.password = username, password

        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)
        self.data_utils = DataUtils()

        file_utils.make_directory(TEMP_DIR)
Exemplo n.º 4
0
def download_files(year=None):
    files_to_download = get_available_files_to_download(year=year)
    file_utils.make_directory(source_files_directory)

    downloaded_update_file_urls = get_downloaded_files(source_files_directory)
    downloaded_update_file_paths = []

    print 'Downloading', len(files_to_download), 'files...'
    for update_file_url in files_to_download:
        if update_file_url not in downloaded_update_file_urls:
            file_name = os.path.basename(update_file_url)
            update_file_path = os.path.join(source_files_directory, file_name)
            xml_file_path = os.path.join(source_files_directory,
                                         file_name.replace('.zip', '.xml'))

            # Download update zip file
            urllib.urlcleanup()
            print 'Downloading file: ', update_file_url
            urllib.urlretrieve(update_file_url, update_file_path)
            print 'Saved', update_file_path

            # TODO - Verify download with md5?

            # Extract update zip file

            print 'Unzipping file', update_file_path
            try:
                with zipfile.ZipFile(update_file_path, 'r') as zip_ref:
                    zip_ref.extractall(source_files_directory)

                downloaded_update_file_urls.append(update_file_url)
                downloaded_update_file_paths.append(xml_file_path)

            except Exception as e:
                print e

            # f = gzip.open(update_file_path, 'rb')
            # with open(xml_file_path, 'w') as xml_file:
            #     xml_file.write(f.read())
            # f.close()

            # Delete update zip file
            print 'Deleting file', update_file_path
            os.remove(update_file_path)

            # Save the downloaded files list
            set_downloaded_files(source_files_directory,
                                 downloaded_update_file_urls)

    return downloaded_update_file_paths
Exemplo n.º 5
0
    def get_config(self):
        """Load the stored config (creating one if empty) and copy it onto self.

        The config is read from DATA_LOADING_DIRECTORY / ``self.config_file``.
        If the loaded value has length 0, ``self.create_config()`` supplies
        it instead. Required keys are copied to same-named attributes (a
        missing one raises ``KeyError``); the optional keys are copied only
        when present.

        Returns:
            The config mapping that was loaded or created.
        """
        file_utils.make_directory(DATA_LOADING_DIRECTORY)
        config = file_utils.load_file(DATA_LOADING_DIRECTORY, self.config_file)
        if len(config) == 0:
            config = self.create_config()

        # Keys that must exist; copied in this order.
        required_keys = (
            'root_directory',
            'index_id',
            'server',
            'server_username',
            'server_password',
            'index',
            'type',
            'src_data_exists',
        )
        for key in required_keys:
            setattr(self, key, config[key])

        # Keys that may be absent from the config.
        for key in ('src_data_directory', 'local_date_time'):
            if key in config:
                setattr(self, key, config[key])

        return config
Exemplo n.º 6
0
    def process_file(self):
        file_name = os.path.basename(self.update_file)

        self.load_config.data_source_name = file_name.split('.')[0]

        print self.update_file

        data_source = XMLDataSource(self.update_file, 2)
        data_source.process_rows(self.process_row)

        print self.update_file
        print 'Docs with citations:', len(self.docs_with_citations)
        print 'New Docs:', len(self.new_docs)
        print 'Total Docs:', len(self.total_ids)

        if len(self.docs_with_citations) > 0:
            file_utils.make_directory("docs_with_citations")
            file_utils.save_file("docs_with_citations",
                                 self.load_config.data_source_name + '.json',
                                 self.docs_with_citations)
Exemplo n.º 7
0
    def create_config(self):
        now = datetime.datetime.now()
        self.local_date_time = now.strftime("%m-%d-%Y %H:%M:%S")
        self.root_directory = self.get_root_directory(self.local_date_time)
        file_utils.make_directory(self.root_directory)

        index_item = self.get_info_for_index_id(self.index_id)
        self.index = index_item['index']
        self.type = index_item['index_type']

        if self.should_reload():
            self.index = self.get_next_index_version(self.index)

        print 'local date:', self.local_date_time
        print 'root directory:', self.root_directory
        print 'index_id:', self.index_id
        print 'server:', self.server
        print 'server_username', self.server_username
        print 'index:', self.index
        print 'type:', self.type

        config = self.set_config()
        return config
Exemplo n.º 8
0
def save_files_per_year(files_per_year):
    """Save ``files_per_year`` as 'files_per_year.json' in the source files directory.

    ``source_files_directory`` is passed to ``file_utils.make_directory``
    before the file is saved.
    """
    target_directory = source_files_directory
    file_utils.make_directory(target_directory)
    file_utils.save_file(target_directory, 'files_per_year.json', files_per_year)
Exemplo n.º 9
0
def load_files_per_year():
    """Load and return 'files_per_year.json' from the source files directory.

    ``source_files_directory`` is passed to ``file_utils.make_directory``
    before the file is loaded.
    """
    target_directory = source_files_directory
    file_utils.make_directory(target_directory)
    files_per_year = file_utils.load_file(target_directory, 'files_per_year.json')
    return files_per_year
Exemplo n.º 10
0
 def batch_docs_directory(self):
     """Return the batch docs directory after passing it to make_directory.

     The path comes from ``self.get_batch_docs_directory()``.
     """
     batch_directory = self.get_batch_docs_directory()
     file_utils.make_directory(batch_directory)
     return batch_directory
Exemplo n.º 11
0
    def __init__(self, load_config):
        """Store ``load_config`` on the instance and prepare TEMP_DIR.

        TEMP_DIR is passed to ``file_utils.make_directory``.
        """
        file_utils.make_directory(TEMP_DIR)

        self.load_config = load_config
Exemplo n.º 12
0
 def del_config(self):
     """Reset the stored config by saving an empty dict over it.

     The file is ``self.config_file`` inside DATA_LOADING_DIRECTORY, which is
     passed to ``file_utils.make_directory`` first.
     """
     cleared_config = {}
     file_utils.make_directory(DATA_LOADING_DIRECTORY)
     file_utils.save_file(DATA_LOADING_DIRECTORY, self.config_file, cleared_config)
Exemplo n.º 13
0
def get_update_records_directory(load_config):
    """Return the 'update_records' directory under the config's other-files directory.

    The path is built from ``load_config.other_files_directory()`` and
    passed to ``file_utils.make_directory`` before being returned.
    """
    records_directory = load_config.other_files_directory() + '/' + 'update_records'
    file_utils.make_directory(records_directory)
    return records_directory