def process_batches(self):
    # Collect previously generated batch files from the temp directory
    batch_file_names = []
    for batch_file_name in os.listdir(TEMP_DIR):
        file_path = os.path.join(TEMP_DIR, batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
            batch_file_names.append(batch_file_name)

    print 'Generated', len(batch_file_names), 'batch file names'
    batch_file_names.sort()

    # Split the source data into batches if none exist yet
    if len(batch_file_names) == 0:
        batch_file_names = self.split_to_batches()

    print len(batch_file_names)
    raw_input('Continue?')

    # Skip batches that were already processed in a previous run
    processed_batches = file_utils.load_file(
        TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json')

    for batch_file_name in batch_file_names:
        if batch_file_name not in processed_batches:
            print 'Loading batch', batch_file_name
            batch = file_utils.load_file(TEMP_DIR, batch_file_name)
            self.copy_docs_batch(batch)

            processed_batches[batch_file_name] = 0
            file_utils.save_file(
                TEMP_DIR,
                'processed_pubmed2018_docs_with_grants_batches.json',
                processed_batches)
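# Nearly every function in this file goes through file_utils.load_file /
# file_utils.save_file. That module is not shown here; the sketch below is a
# minimal, assumed implementation inferred from how the callers use it
# (load_file returns parsed JSON, or an empty dict when the file is missing;
# save_file writes JSON and creates the directory if needed). The real module
# may differ.
import json
import os


def load_file(directory, file_name):
    # Return the parsed JSON contents of directory/file_name, or {} if the
    # file does not exist (callers check `len(...) == 0` / `is None`).
    file_path = os.path.join(directory, file_name)
    if not os.path.isfile(file_path):
        return {}
    with open(file_path) as f:
        return json.load(f)


def save_file(directory, file_name, data):
    # Serialize data as JSON into directory/file_name.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, file_name), 'w') as f:
        json.dump(data, f)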
def process_batches(self):
    load_config = self.get_load_config()
    generated_files_directory = load_config.data_source_directory()
    other_files_directory = load_config.other_files_directory()

    batch_file_names = []
    for batch_file_name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
            batch_file_names.append(batch_file_name)

    print 'Generated', len(batch_file_names), 'batch file names'
    batch_file_names.sort()

    if len(batch_file_names) == 0:
        batch_file_names = self.split_to_batches()

    processed_batches = file_utils.load_file(
        other_files_directory, 'processed_spires_pubmed_batches.json')

    for batch_file_name in batch_file_names:
        if batch_file_name not in processed_batches:
            print 'Loading batch', batch_file_name
            batch = file_utils.load_file(generated_files_directory,
                                         batch_file_name)
            self.process_batch(batch)

            processed_batches[batch_file_name] = 0
            file_utils.save_file(other_files_directory,
                                 'processed_spires_pubmed_batches.json',
                                 processed_batches)
def process_irdb_relations(self):
    generated_files_directory = self.load_config.data_source_directory()
    all_ids = file_utils.load_file(generated_files_directory,
                                   self.batch_file_name)

    processed_count = 0
    batch_count = 0

    # Resume support: ids already processed or queued for update in a previous run
    ids_to_update = file_utils.load_file(
        generated_files_directory, 'ids_to_update_' + self.batch_file_name)
    processed_ids = file_utils.load_file(
        generated_files_directory, 'processed_ids_' + self.batch_file_name)

    filtered_ids = []
    for _id in all_ids:
        if _id not in processed_ids:
            filtered_ids.append(_id)

    print 'Processing', self.batch_file_name, len(filtered_ids), 'ids'

    self.batch_fetch_docs(filtered_ids, ID_IRDB)

    for _id in filtered_ids:
        processed_ids[_id] = ''
        processed_count += 1
        batch_count += 1

        if processed_count % 500 == 0:
            print 'Processing irdb', _id
            # print 'Processing', processed_count, '/', total_count

        derwent_ids = self.process_id(_id)
        if len(derwent_ids) > 0:
            # print ex_rl
            ids_to_update[_id] = derwent_ids

        # Checkpoint progress every 500 ids
        if batch_count >= 500:
            batch_count = 0
            file_utils.save_file(generated_files_directory,
                                 'ids_to_update_' + self.batch_file_name,
                                 ids_to_update)
            file_utils.save_file(generated_files_directory,
                                 'processed_ids_' + self.batch_file_name,
                                 processed_ids)

    file_utils.save_file(generated_files_directory,
                         'ids_to_update_' + self.batch_file_name, ids_to_update)
    file_utils.save_file(generated_files_directory,
                         'processed_ids_' + self.batch_file_name, processed_ids)
    # file_utils.save_file(generated_files_directory,
    #                      'missing_pubmed_ids_' + self.batch_file_name,
    #                      self.missing_pubmed_ids)

    print 'Docs to update..............................................', len(ids_to_update)
def process_file(load_config, batch_file_name):
    generated_files_directory = load_config.generated_files_directory()
    # print 'Processing batch', batch_file_name

    processes_ids_file_name = 'processed_ids_' + batch_file_name
    ids_to_update_file_name = 'ids_to_update_' + batch_file_name

    # Get processed ids
    processed_ids = file_utils.load_file(generated_files_directory,
                                         processes_ids_file_name)
    if processed_ids is None or len(processed_ids) == 0:
        print 'Processed ids file not found, aborting...'
        return

    # Get batch ids
    batch_ids = file_utils.load_file(generated_files_directory, batch_file_name)
    if batch_ids is None or len(batch_ids) == 0:
        print 'Batch ids not found, aborting...'
        return

    # Continue processing
    # print batch_file_name, 'Processed ids count:', len(processed_ids), ' ~ ', len(batch_ids)
    # if len(processed_ids) != len(batch_ids):
    #     print 'Processing not finished, aborting...'
    #     return
    # else:
    #     print 'Processing complete for', self.batch_file_name, ', proceeding with data load...'

    ids_to_update = file_utils.load_file(generated_files_directory,
                                         ids_to_update_file_name)

    total_count = len(ids_to_update)
    count = 0

    reformatted_array = {}
    for _id in ids_to_update:
        count += 1
        derwent_ids = ids_to_update[_id]

        if _id not in reformatted_array:
            reformatted_array[_id] = []

        if len(derwent_ids) > 0:
            relationship = {
                'index_id': ID_DERWENT_PATENTS,
                'ids': derwent_ids,
                'type': RELATIONSHIP_TYPE_RELATIONS
            }
            reformatted_array[_id].append(relationship)

    # print 'Reformatted ids', len(reformatted_array)
    return reformatted_array, ids_to_update
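# For illustration only: the shape of one entry in the dict that process_file
# above returns. The doc id and Derwent ids are invented; the constants are the
# ones referenced in the function.
#
# {
#     'some_irdb_id': [
#         {
#             'index_id': ID_DERWENT_PATENTS,
#             'ids': ['derwent_id_1', 'derwent_id_2'],
#             'type': RELATIONSHIP_TYPE_RELATIONS
#         }
#     ]
# }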
def verify_citations():
    load_config = pubmed_load_config.get_load_config()
    load_config.process_count = 1

    FTPManager(load_config).download_new_update_files()

    print 'Loading pubmed ids...'
    doc_ids = file_utils.load_file(load_config.index,
                                   load_config.index + '_ids.json')
    if len(doc_ids) == 0:
        doc_ids = export_doc_ids(load_config.server, load_config.index,
                                 load_config.type, load_config.index,
                                 load_config.index + '_ids.json')
    print len(doc_ids), 'Total pubmed ids'

    total_new_update_files = get_all_update_files(load_config)

    filtered_update_files = []
    for update_file in total_new_update_files:
        if '1010.xml' in update_file:
            filtered_update_files.append(update_file)

    print 'Total update files:', len(filtered_update_files)
    print filtered_update_files

    if len(filtered_update_files) > 0:
        for new_update_file in filtered_update_files:
            print 'Processing file:', new_update_file
            # Renamed from `verify_citations` to avoid shadowing this function
            verifier = VerifyCitations(new_update_file, doc_ids)
            verifier.process_file()
def load_update_summary(self, update_file):
    update_record_file_name = self.get_update_summary_file_name(update_file)
    # print 'update_record_file_name', update_record_file_name
    return file_utils.load_file(
        self.get_update_records_directory(DIR_UPDATE_SUMMARY),
        update_record_file_name)
def run():
    directory = '/data/data_loading/scripts/data-load-n/data_load/irdb/new_sample_data'
    file_format = '.json'

    missing_fields = {}
    for name in os.listdir(directory):
        file_path = os.path.join(directory, name)
        if os.path.isfile(file_path) and name.endswith(file_format):
            data = file_utils.load_file(directory, name)
            if name in fields_mapping:
                fields = fields_mapping[name]

                missing_fields_for_file = []
                for data_item in data:
                    for field in fields:
                        if field not in data_item:
                            missing_fields_for_file.append(field)
                            break

                missing_fields[name] = missing_fields_for_file

    for name in missing_fields:
        print name
        print missing_fields[name]
        print '-----------------------------'
def get_downloaded_files(load_config):
    other_files_directory = load_config.other_files_directory()
    downloaded_files = file_utils.load_file(other_files_directory,
                                            GRANTS_DOWNLOADED_FILES)
    if len(downloaded_files) == 0:
        downloaded_files = []
    return downloaded_files
def get_downloaded_update_file_urls(self):
    other_files_directory = self.load_config.other_files_directory()
    downloaded_update_file_urls = file_utils.load_file(
        other_files_directory, DOWNLOADED_UPDATE_FILES)
    if len(downloaded_update_file_urls) == 0:
        return []
    return downloaded_update_file_urls
def process_update_record(load_config, update_record_name):
    logger = log_utils.create_logger('pubmed2018_update',
                                     load_config.log_files_directory())
    logger.info('Loading update record: ' + str(update_record_name))

    update_records_directory = get_update_records_directory(load_config)
    update_record = file_utils.load_file(update_records_directory,
                                         update_record_name)

    local_date = update_record['local_date']
    update_data = update_record['update_data']
    docs_with_new_citations = update_record['docs_with_new_citations']

    logger.info('Update record loaded')
    logger.info('Date: ' + str(local_date))
    logger.info('Update files: ')
    for update_file in update_data:
        logger.info(update_file)
    logger.info('Docs with new citations: ' + str(len(docs_with_new_citations)))

    all_prospects = send_prospective_citations_notifications(
        logger, docs_with_new_citations)

    # Send update notification
    logger.info('Sending update status mail...')
    EmailClient.send_update_notifications(local_date, update_data, all_prospects)
    logger.info('Done')
def load_tasks_list(self):
    # print 'Loading tasks list', self.root_directory
    tasks_list = file_utils.load_file(self.root_directory, 'tasks_list.json')
    if len(tasks_list) == 0:
        tasks_list = []
    return tasks_list
def run():
    load_config = irdb_load_config.get_load_config()
    load_config.data_source_name = DS_PATENT_RELATIONS

    data_source_batch_name = 'loaded_ids'
    data_source_batch_directory = load_config.data_source_batch_directory(
        data_source_batch_name)

    all_updated_ids = {}
    all_indexed_ids = {}
    all_failed_ids = {}

    for name in os.listdir(data_source_batch_directory):
        file_path = os.path.join(data_source_batch_directory, name)
        if os.path.isfile(file_path) and name.startswith(DATA_LOADER_BATCH_PREFIX):
            # print 'processing file:', file_path
            batch_data = file_utils.load_file(data_source_batch_directory, name)

            updated_ids = batch_data['updated_ids']
            indexed_ids = batch_data['indexed_ids']
            failed_ids = batch_data['failed_ids']

            for _id in updated_ids:
                all_updated_ids[_id] = 0
            for _id in indexed_ids:
                all_indexed_ids[_id] = 0
            for _id in failed_ids:
                all_failed_ids[_id] = 0

    print len(all_failed_ids), 'all_failed_ids'
    print len(all_indexed_ids), 'all_indexed_ids'
    print len(all_updated_ids), 'all_updated_ids'
def process_completed(self):
    # if len(self.citation_errors) == 0:
    #     self.citation_errors = file_utils.load_file(self.batch_docs_directory(),
    #                                                 'citation_errors.json')
    # print len(self.citation_errors), 'citation errors'
    # print self.citation_errors.keys()
    # file_utils.save_file(self.batch_docs_directory(), 'citation_errors.json',
    #                      self.citation_errors)

    batch_file_names = []
    for batch_file_name in os.listdir(self.batch_docs_directory()):
        file_path = os.path.join(self.batch_docs_directory(), batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith(
                'citation_errors_batch_'):
            batch_file_names.append(batch_file_name)

    citation_errors = {}
    for batch_file_name in batch_file_names:
        print 'Loading batch', batch_file_name
        batch = file_utils.load_file(self.batch_docs_directory(), batch_file_name)
        for _id in batch:
            citation_errors[_id] = batch[_id]

    print len(citation_errors), 'citation errors'
    print citation_errors.keys()

    raw_input('Load Citations?')
def start_process_doc_batch(self, batch_file_name):
    print 'Loading batch', batch_file_name
    batch = file_utils.load_file(self.batch_docs_directory(), batch_file_name)
    batch_name = batch_file_name.split('.')[0]
    results = self.process_docs_batch(batch, batch_name)
    file_utils.save_file(self.batch_docs_directory(),
                         RESULTS_FILE_PREFIX + batch_file_name, results)
def get_processed_files(load_config):
    other_files_directory = load_config.other_files_directory()
    processed_file_urls = file_utils.load_file(other_files_directory,
                                               PROCESSED_UPDATE_FILES)
    if len(processed_file_urls) == 0:
        return []
    return processed_file_urls
def process_batch(self, load_config, batch_file_name):
    generated_files_directory = load_config.generated_files_directory()
    ids_to_update_file_name = 'ids_to_update_' + batch_file_name
    ids_to_update = file_utils.load_file(generated_files_directory,
                                         ids_to_update_file_name)
    return ids_to_update
def get_loaded_ids(self, reports_directory):
    loaded_ids = {}
    for name in os.listdir(reports_directory):
        file_path = os.path.join(reports_directory, name)
        if os.path.isfile(file_path) and name.startswith('loaded_ids_'):
            # print 'processing file:', name
            batch_data = file_utils.load_file(reports_directory, name)
            for _id in batch_data:
                loaded_ids[_id] = 0
    return loaded_ids
def save_update_record(self):
    update_records_name = 'pubmed_update_records.json'
    update_records = file_utils.load_file(
        self.pubmed_updater.get_update_records_directory(), update_records_name)
    if len(update_records) == 0:
        update_records = []

    # update_records_for_date = []
    # if self.local_date_time in update_records:
    #     update_records_for_date = update_records[self.local_date_time]
    # update_records_for_date.extend(self.files_to_process)
    # update_records[self.local_date_time] = update_records_for_date

    update_data = self.pubmed_updater.generate_update_summary(
        self.files_to_process)

    update_file_records = []
    for update_file_path in update_data:
        update_file_name = os.path.basename(update_file_path)
        update_data_for_file = update_data[update_file_path]

        articles_processed = len(update_data_for_file['articles_processed'])
        new_articles = len(update_data_for_file['new_articles'])
        updated_articles = articles_processed - new_articles

        update_file_record_item = {
            'file_name': update_file_name,
            'file_path': update_file_path,
            'total_articles': articles_processed,
            'new_articles': new_articles,
            'updated_articles': updated_articles
        }
        update_file_records.append(update_file_record_item)

    update_record_item = {
        'date': self.local_date_time,
        'update_files': update_file_records
    }
    update_records.append(update_record_item)

    file_utils.save_file(self.pubmed_updater.get_update_records_directory(),
                         update_records_name, update_records)

    # Save processed files list
    file_manager.update_processed_files(self.get_load_config(),
                                        self.files_to_process)
def run(self):
    # doc_ids = export_doc_ids(self.server, self.index,
    #                          self.type, self.index + '_' + self.type, 'doc_ids.json')
    doc_ids = file_utils.load_file(self.ct_load_config.index,
                                   self.ct_load_config.index + '_ids.json')
    if len(doc_ids) == 0:
        doc_ids = export_doc_ids.export_doc_ids(self.ct_load_config.server,
                                                self.ct_load_config.index,
                                                self.ct_load_config.type)

    doc_ids = doc_ids.keys()

    self.data_utils.batch_fetch_docs_for_ids(
        base_url=self.ct_load_config.server,
        ids=doc_ids,
        index=self.ct_load_config.index,
        type=self.ct_load_config.type,
        docs_fetched=self.docs_fetched)

    print 'Total pubmed relations', len(self.pubmed_relations)
    print 'Total ct relations', len(self.ct_relations)

    # Load Pubmed relations
    pubmed_ids = {}
    pubmed_ids = data_mapper.reformat(reformatted_array=pubmed_ids,
                                      relations_array=self.pubmed_relations,
                                      dest_index_id=ID_CLINICAL_TRIALS,
                                      relationship_type=RELATIONSHIP_TYPE_CITATIONS)
    print 'Reformatted pubmed ids', len(pubmed_ids)

    self.pubmed_load_config.append_relations = True
    self.pubmed_load_config.source = 'ct_publications'
    self.pubmed_load_config.data_source_name = 'ct_publications_relations'

    data_load_batcher = DataLoadBatcher(self.pubmed_load_config,
                                        self.pubmed_load_config.index,
                                        self.pubmed_load_config.type)
    data_load_batcher.load_relationships = True
    data_load_batcher.process_data_rows('pubmed_ct_citations', pubmed_ids)

    # Load Clinical trials relations
    ct_ids = {}
    ct_ids = data_mapper.reformat(reformatted_array=ct_ids,
                                  relations_array=self.ct_relations,
                                  dest_index_id=ID_PUBMED,
                                  relationship_type=RELATIONSHIP_TYPE_CITED_BYS)
    print 'Reformatted ct ids', len(ct_ids)

    self.ct_load_config.append_relations = True
    self.ct_load_config.source = 'ct_publications'
    self.ct_load_config.data_source_name = 'ct_publications_relations'

    data_load_batcher = DataLoadBatcher(self.ct_load_config,
                                        self.ct_load_config.index,
                                        self.ct_load_config.type)
    data_load_batcher.load_relationships = True
    data_load_batcher.process_data_rows('ct_pubmed_cited_bys', ct_ids)
def run(self):
    # self.get_updated_docs()
    self.updated_docs = file_utils.load_file(
        self.load_config.other_files_directory(), 'updated_docs.json')

    # self.get_original_docs()
    # sys.exit(1)
    self.original_docs = file_utils.load_file(
        self.load_config.other_files_directory(), 'original_docs.json')

    print 'Updated docs:', len(self.updated_docs)
    print 'Original docs:', len(self.original_docs)

    self.inverted_index = file_utils.load_file(
        self.load_config.other_files_directory(), 'inverted_index.json')
    self.inverted_index_for_updated_docs = file_utils.load_file(
        self.load_config.other_files_directory(),
        'inverted_index_for_updated_docs.json')

    print 'Inverted index:', len(self.inverted_index)
    print 'inverted_index_for_updated_docs:', len(
        self.inverted_index_for_updated_docs)

    # print json.dumps(self.inverted_index_for_updated_docs)
    # input = raw_input('Continue?')
    # if input.lower() in ['n', 'no', '0']:
    #     sys.exit(1)

    self.update_docs()

    print 'Docs with updates', len(self.docs_with_updates)
    # print json.dumps(self.docs_with_updates)

    print 'Missing docs'
    print json.dumps(self.missing_docs.keys())

    file_utils.save_file(self.load_config.other_files_directory(),
                         'docs_with_updates.json', self.docs_with_updates)
def save_new_pmids(self, update_files):
    load_config = self.load_manager.get_load_config()
    self.existing_pubmed_ids = file_utils.load_file(
        load_config.other_files_directory(), ALL_PUBMED_IDS_FILE)

    update_summary = self.generate_update_summary(update_files)
    for update_file in update_summary:
        update_summary_for_file = update_summary[update_file]
        articles_processed = update_summary_for_file['articles_processed']
        for _id in articles_processed:
            self.existing_pubmed_ids[_id] = None

    file_utils.save_file(load_config.other_files_directory(),
                         ALL_PUBMED_IDS_FILE, self.existing_pubmed_ids)
def get_doc_ids(load_config, index_id):
    other_files_directory = load_config.other_files_directory()
    file_name = 'DOC_IDS_' + index_id + '.json'
    print 'Loading', file_name

    doc_ids = file_utils.load_file(other_files_directory, file_name)
    if len(doc_ids) == 0:
        doc_ids = export_doc_ids.export_doc_ids(
            server=SERVER,
            src_index=INDEX_MAPPING[index_id]['index'],
            src_type=INDEX_MAPPING[index_id]['type'],
            dest_dir=other_files_directory,
            dest_file_name=file_name)

    return doc_ids
def load_prospects(self):
    prospects_file_name = self.get_prospects_file_name()
    update_records_directory = self.pubmed_updater.get_update_records_directory(
        DIR_PROSPECTS)

    self.get_logger().info('Loading prospects...' + update_records_directory +
                           ' ' + prospects_file_name)

    update_record = file_utils.load_file(update_records_directory,
                                         prospects_file_name)
    if 'prospects' in update_record:
        return update_record['prospects']
    else:
        return None
def check_tags_and_annotations(self):
    missing_ids = file_utils.load_file(missing_ids_directory, 'missing_ids.json')
    new_ids = file_utils.load_file(missing_ids_directory, 'new_ids.json')

    print 'Missing ids', len(missing_ids)
    print 'New ids', len(new_ids)

    docs_with_tags = self.fetch_ids()

    missing_docs_with_tags = []
    for _id in missing_ids:
        if _id in docs_with_tags:
            missing_docs_with_tags.append(_id)
            print 'Missing docs with tags', _id

    print 'Missing docs with tags', len(missing_docs_with_tags)
    print 'Missing docs with tags', json.dumps(missing_docs_with_tags)

    for _id in missing_docs_with_tags:
        existing_doc = self.get_existing_doc(_id)
        if 'userTags' in existing_doc:
            user_tags = existing_doc['userTags']
            for user_tag in user_tags:
                added_by = user_tag['added_by']
                if added_by == '*****@*****.**':
                    self.docs_for_dolan[_id] = existing_doc
                    print _id
                    print user_tags
                    break

    print 'Docs for Dolan', len(self.docs_for_dolan)
    print 'Docs for Dolan', self.docs_for_dolan.keys()
def run(self):
    # doc_ids = export_doc_ids(self.server, self.index,
    #                          self.type, self.index + '_' + self.type, 'doc_ids.json')
    doc_ids = file_utils.load_file(self.index, self.index + '_ids.json')
    if len(doc_ids) == 0:
        doc_ids = export_doc_ids.export_doc_ids(self.server, self.index,
                                                self.type)

    doc_ids = doc_ids.keys()

    batch_doc_processor = BatchDocProcessor(doc_ids, self.process_batch,
                                            self.batch_size,
                                            self.process_count,
                                            self.process_spawn_delay)
    batch_doc_processor.run()
def combine_original_docs(self):
    files = []
    generated_files_directory = self.load_config.generated_files_directory()
    for name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, name)
        if os.path.isfile(file_path) and name.startswith('original_docs_'):
            files.append(name)

    combined = {}
    for name in files:
        data = file_utils.load_file(generated_files_directory, name)
        combined.update(data)

    print 'Original docs', len(combined)
    file_utils.save_file(self.load_config.other_files_directory(),
                         'original_docs.json', combined)
def run():
    load_config = irdb_load_config.get_load_config()
    load_config.data_source_name = DS_EXTENDED_RELATIONS

    generated_files_directory = load_config.generated_files_directory()

    batch_file_names = []
    for batch_file_name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, batch_file_name)
        if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
            batch_file_names.append(batch_file_name)

    for batch_file_name in batch_file_names:
        batch = file_utils.load_file(generated_files_directory, batch_file_name)
        if len(batch) < 1000:
            print batch_file_name, len(batch)
def combine_inverted_index(self):
    files = []
    generated_files_directory = self.load_config.generated_files_directory()
    for name in os.listdir(generated_files_directory):
        file_path = os.path.join(generated_files_directory, name)
        if os.path.isfile(file_path) and name.startswith('inverted_index_'):
            files.append(name)

    combined = {}
    for name in files:
        data = file_utils.load_file(generated_files_directory, name)
        combined.update(data)

    print 'Inverted index', len(combined)
    file_utils.save_file(self.load_config.other_files_directory(),
                         'inverted_index.json', combined)
def analyse_failed_docs_in_batch(data_loader_batch_directory):
    # print '...Processing', data_loader_batch_directory
    for name in os.listdir(data_loader_batch_directory):
        file_path = os.path.join(data_loader_batch_directory, name)
        if os.path.isfile(file_path) and name.startswith('failed_docs_'):
            failed_docs = file_utils.load_file(data_loader_batch_directory, name)
            print file_path, '- Failed docs', len(failed_docs)

            if len(failed_docs) > 0:
                a = raw_input('List docs? (y/n)')
                if a.lower() in ['y', 'yes']:
                    for _id in failed_docs:
                        reason = failed_docs[_id]['reason']
                        print 'Doc:', _id
                        print 'Reason', reason

                c = raw_input('Continue?')
                if c.lower() in ['n', 'no']:
                    break
def get_config(self):
    file_utils.make_directory(DATA_LOADING_DIRECTORY)
    config = file_utils.load_file(DATA_LOADING_DIRECTORY, self.config_file)
    if len(config) == 0:
        config = self.create_config()

    self.root_directory = config['root_directory']
    self.index_id = config['index_id']
    self.server = config['server']
    self.server_username = config['server_username']
    self.server_password = config['server_password']
    self.index = config['index']
    self.type = config['type']
    self.src_data_exists = config['src_data_exists']

    if 'src_data_directory' in config:
        self.src_data_directory = config['src_data_directory']
    if 'local_date_time' in config:
        self.local_date_time = config['local_date_time']

    return config
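# For illustration only: a config dict shaped the way get_config above reads
# it. All values here are invented placeholders; only the key names are taken
# from the function. 'src_data_directory' and 'local_date_time' are optional.
SAMPLE_CONFIG = {
    'root_directory': '/data/data_loading',       # placeholder path
    'index_id': 'PUBMED',                         # placeholder index id
    'server': 'http://localhost:9200',            # placeholder server URL
    'server_username': '',
    'server_password': '',
    'index': 'pubmed2018',
    'type': 'article',
    'src_data_exists': False,
    # 'src_data_directory': '/data/pubmed_src',
    # 'local_date_time': '2018-01-01 00:00:00'
}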