Example #1
    def run(self):
        old_ids = export_doc_ids(server=SERVER,
                                 src_index=OLD_INDEX,
                                 src_type=OLD_TYPE)

        new_ids = export_doc_ids(server=SERVER,
                                 src_index=NEW_INDEX,
                                 src_type=NEW_TYPE)

        for _id in old_ids:
            if _id not in new_ids:
                self.missing_ids[_id] = 0
                if len(self.missing_ids) % 1000 == 0:
                    print 'Missing ids', len(self.missing_ids)

        for _id in new_ids:
            if _id not in old_ids:
                self.new_ids[_id] = 0
                if len(self.new_ids) % 1000 == 0:
                    print 'New ids', len(self.new_ids)

        print 'Missing ids', len(self.missing_ids)
        print 'New ids', len(self.new_ids)

        file_utils.make_directory(missing_ids_directory)

        file_utils.save_file(missing_ids_directory, 'missing_ids.json',
                             self.missing_ids.keys())
        file_utils.save_file(missing_ids_directory, 'new_ids.json',
                             self.new_ids)
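
Every example in this listing calls file_utils.save_file, and many also call file_utils.load_file and file_utils.make_directory, but the helpers themselves are never shown. A minimal sketch of what they might look like, assuming they simply wrap JSON serialization under a (directory, file name) pair and that load_file falls back to an empty dict when the file is missing (both are assumptions, not confirmed by the snippets):

import json
import os


def make_directory(directory):
    # Create the directory if it does not already exist.
    if not os.path.exists(directory):
        os.makedirs(directory)


def save_file(directory, file_name, data):
    # Serialize `data` as JSON to <directory>/<file_name>.
    make_directory(directory)
    with open(os.path.join(directory, file_name), 'w') as output_file:
        json.dump(data, output_file)


def load_file(directory, file_name):
    # Read JSON from <directory>/<file_name>; return an empty dict when the
    # file does not exist (assumed, matching how the examples treat a missing
    # "processed batches" checkpoint file).
    file_path = os.path.join(directory, file_name)
    if not os.path.isfile(file_path):
        return {}
    with open(file_path) as input_file:
        return json.load(input_file)

Under that assumption, the resume pattern used throughout (load a checkpoint file, process one unit of work, immediately re-save the checkpoint) works because each save_file call rewrites the complete JSON state.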
Example #2
 def save_docs_with_new_citations(self, docs_with_new_citations,
                                  update_file):
     update_record_file_name = self.get_docs_with_new_citations_file_name(
         update_file)
     file_utils.save_file(
         self.get_update_records_directory(DIR_DOCS_WITH_NEW_CITATION),
         update_record_file_name, docs_with_new_citations)
Example #3
    def process_batches(self):
        load_config = self.get_load_config()
        generated_files_directory = load_config.data_source_directory()
        other_files_directory = load_config.other_files_directory()

        batch_file_names = []
        for batch_file_name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory,
                                     batch_file_name)
            if os.path.isfile(file_path) and batch_file_name.startswith(
                    'batch_'):
                batch_file_names.append(batch_file_name)

        print "Generated ", len(batch_file_names), 'batch file names'

        batch_file_names.sort()

        if len(batch_file_names) == 0:
            batch_file_names = self.split_to_batches()

        processed_batches = file_utils.load_file(
            other_files_directory, 'processed_spires_pubmed_batches.json')
        for batch_file_name in batch_file_names:
            if batch_file_name not in processed_batches:
                print 'Loading batch', batch_file_name
                batch = file_utils.load_file(generated_files_directory,
                                             batch_file_name)
                self.process_batch(batch)
                processed_batches[batch_file_name] = 0
                file_utils.save_file(other_files_directory,
                                     'processed_spires_pubmed_batches.json',
                                     processed_batches)
Example #4
    def get_updated_docs(self):
        load_config = self.get_load_config(updates_directory)
        ftp_manager = FTPManager(load_config)

        update_file_urls = ftp_manager.get_update_file_urls()
        update_file_urls = update_file_urls[:2]

        ftp_manager.download_missing_files(file_urls=update_file_urls,
                                           no_of_files=2)

        all_files = file_manager.get_all_files(load_config)
        files_to_process = all_files[:2]
        # files_to_process = file_manager.get_new_update_files(load_config, update_file_urls, 2)
        print files_to_process

        for update_file in files_to_process:
            file_name = os.path.basename(update_file)
            self.current_update_file = file_name  #file_name.split('.')[0]

            xml_data_source = XMLDataSource(update_file, 2)
            xml_data_source.process_rows(self.process_row)

        print 'Total updated ids:', len(self.updated_docs)

        file_utils.save_file(self.load_config.other_files_directory(),
                             'updated_docs.json', self.updated_docs)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'inverted_index_for_updated_docs.json',
                             self.inverted_index_for_updated_docs)
Example #5
    def process_batches(self):
        batch_file_names = []
        for batch_file_name in os.listdir(TEMP_DIR):
            file_path = os.path.join(TEMP_DIR, batch_file_name)
            if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
                batch_file_names.append(batch_file_name)

        print "Generated ", len(batch_file_names), 'batch file names'

        batch_file_names.sort()

        if len(batch_file_names) == 0:
            batch_file_names = self.split_to_batches()

        print len(batch_file_names)
        raw_input('Continue?')

        processed_batches = file_utils.load_file(TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json')
        for batch_file_name in batch_file_names:
            if batch_file_name not in processed_batches:
                print 'Loading batch', batch_file_name
                batch = file_utils.load_file(TEMP_DIR, batch_file_name)
                self.copy_docs_batch(batch)
                processed_batches[batch_file_name] = 0
                file_utils.save_file(TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json', processed_batches)
Example #6
def filter_and_split_ids_into_batches(load_config):
    other_files_directory = load_config.other_files_directory()
    generated_files_directory = load_config.data_source_directory()

    all_ids = export_doc_ids.get_doc_ids_for_load_config(load_config)
    # total_count = len(all_ids)

    max_batch_count = BATCH_DOC_COUNT

    batch_index = 0
    batch_ids = []
    # Splitting into batches
    for _id in all_ids:
        batch_ids.append(_id)

        if len(batch_ids) >= max_batch_count:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            file_utils.save_file(generated_files_directory, batch_file_name,
                                 batch_ids)

            batch_ids = []
            batch_index += 1

    if len(batch_ids) > 0:
        print 'Writing batch:', batch_index
        batch_file_name = 'batch_' + str(batch_index) + '.json'
        file_utils.save_file(generated_files_directory, batch_file_name,
                             batch_ids)

        batch_index += 1

    print batch_index, 'batches to process'
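
Examples #6, #7, #18, and #24 all repeat the same chunk-and-write loop, with only the id source, output directory, and batch size varying. A hedged sketch of that pattern as a standalone helper (the helper name is hypothetical, and it relies on the JSON save_file behavior assumed under Example #1):

import file_utils  # assumes the project-local file_utils module is importable


def split_ids_into_batch_files(ids, output_directory, max_batch_count):
    # Write `ids` into numbered JSON files of at most `max_batch_count`
    # entries each and return the generated batch file names.
    batch_file_names = []
    batch_ids = []
    batch_index = 0

    for _id in ids:
        batch_ids.append(_id)
        if len(batch_ids) >= max_batch_count:
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            file_utils.save_file(output_directory, batch_file_name, batch_ids)
            batch_file_names.append(batch_file_name)
            batch_ids = []
            batch_index += 1

    # Flush the final, partially filled batch, if any.
    if batch_ids:
        batch_file_name = 'batch_' + str(batch_index) + '.json'
        file_utils.save_file(output_directory, batch_file_name, batch_ids)
        batch_file_names.append(batch_file_name)

    return batch_file_names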
Example #7
    def split_to_batches(self):
        server = self.src_data_loader_utils.server
        src_index = self.src_data_loader_utils.index
        src_type = self.src_data_loader_utils.type

        print 'Fetching doc ids for', src_index, src_type
        query = {
            "nested": {
                "path": "grants",
                "query": {
                    "bool": {
                        "must": [
                            {
                                "exists": {
                                    "field": "grants"
                                }
                            }
                        ]
                    }
                }
            }
        }

        all_pubmed_ids = export_doc_ids.get_doc_ids(server,
                                                    src_index,
                                                    src_type,
                                                    TEMP_DIR,
                                                    'pubmed2018_docs_with_grants.json', query=query)
        # all_pubmed_ids = all_pubmed_ids.keys()
        # all_pubmed_ids.sort()
        self.total_doc_count = len(all_pubmed_ids)  

        max_batch_count = 5000
        
        batch_file_names = []
        batch_index = 0
        batch_ids = []
        # Splitting into batches
        for _id in all_pubmed_ids:
            batch_ids.append(_id)

            if len(batch_ids) >= max_batch_count:
                print 'Writing batch:', batch_index
                batch_file_name = 'batch_' + str(batch_index) + '.json'
                batch_file_names.append(batch_file_name)
                file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)

                batch_ids = []
                batch_index += 1

        if len(batch_ids) > 0:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)

            batch_index += 1

        return batch_file_names
Example #8
 def start_process_doc_batch(self, batch_file_name):
     print 'Loading batch', batch_file_name
     batch = file_utils.load_file(self.batch_docs_directory(),
                                  batch_file_name)
     batch_name = batch_file_name.split('.')[0]
     results = self.process_docs_batch(batch, batch_name)
     file_utils.save_file(self.batch_docs_directory(),
                          RESULTS_FILE_PREFIX + batch_file_name, results)
Example #9
    def create_mapping(self, data_directory, format):
        mapping = {}
        for signature in format:
            format_item = format[signature]
            parents = format_item['parents']

            mapping = self.add_mapping(parents, mapping)

        file_utils.save_file(data_directory, 'mapping.json', mapping)
Example #10
def save_update_record_for_date(load_config, local_date, update_data, docs_with_new_citations):
    update_records_directory = get_update_records_directory(load_config)
    # Stamp the record with the current local time (assumes `import datetime` at module level).
    local_time = datetime.datetime.now().strftime("%H:%M:%S")

    update_record = {
        'local_date': local_date,
        'update_data': update_data,
        'docs_with_new_citations': docs_with_new_citations,
    }
    file_utils.save_file(update_records_directory, 'pubmed_update_record_' + local_date + '_' + local_time, update_record)
Example #11
    def save_prospects(self, prospects):
        update_record = {}
        update_record['update_files'] = self.files_to_process
        update_record['prospects'] = prospects
        update_record['date'] = self.local_date_time

        update_records_directory = self.pubmed_updater.get_update_records_directory(
            DIR_PROSPECTS)
        prospects_file_name = self.get_prospects_file_name()

        file_utils.save_file(update_records_directory, prospects_file_name,
                             update_record)
Example #12
    def save_update_record(self):
        update_records_name = 'pubmed_update_records.json'

        update_records = file_utils.load_file(
            self.pubmed_updater.get_update_records_directory(),
            update_records_name)
        if len(update_records) == 0:
            update_records = []

        # update_records_for_date = []
        # if self.local_date_time in update_records:
        #     update_records_for_date = update_records[self.local_date_time]

        # update_records_for_date.extend(self.files_to_process)
        # update_records[self.local_date_time] = update_records_for_date

        update_data = self.pubmed_updater.generate_update_summary(
            self.files_to_process)

        update_file_records = []
        for update_file_path in update_data:
            update_file_name = os.path.basename(update_file_path)
            update_data_for_file = update_data[update_file_path]

            articles_processed = len(
                update_data_for_file['articles_processed'])
            new_articles = len(update_data_for_file['new_articles'])
            updated_articles = articles_processed - new_articles

            update_file_record_item = {
                'file_name': update_file_name,
                'file_path': update_file_path,
                'total_articles': articles_processed,
                'new_articles': new_articles,
                'updated_articles': updated_articles
            }

            update_file_records.append(update_file_record_item)

        update_record_item = {
            'date': self.local_date_time,
            'update_files': update_file_records
        }

        update_records.append(update_record_item)

        file_utils.save_file(
            self.pubmed_updater.get_update_records_directory(),
            update_records_name, update_records)

        # Save processed files list
        file_manager.update_processed_files(self.get_load_config(),
                                            self.files_to_process)
Example #13
 def set_config(self):
     config = {}
     config['root_directory'] = self.root_directory
     config['index_id'] = self.index_id
     config['server'] = self.server
     config['server_username'] = self.server_username
     config['server_password'] = self.server_password
     config['index'] = self.index
     config['type'] = self.type
     config['src_data_exists'] = self.src_data_exists
     config['src_data_directory'] = self.src_data_directory
     config['local_date_time'] = self.local_date_time
     file_utils.save_file(DATA_LOADING_DIRECTORY, self.config_file, config)
     return config
Example #14
    def save_new_pmids(self, update_files):
        load_config = self.load_manager.get_load_config()
        self.existing_pubmed_ids = file_utils.load_file(
            load_config.other_files_directory(), ALL_PUBMED_IDS_FILE)

        update_summary = self.generate_update_summary(update_files)
        for update_file in update_summary:
            update_summary_for_file = update_summary[update_file]
            articles_processed = update_summary_for_file['articles_processed']
            for _id in articles_processed:
                self.existing_pubmed_ids[_id] = None

        file_utils.save_file(load_config.other_files_directory(),
                             ALL_PUBMED_IDS_FILE, self.existing_pubmed_ids)
Example #15
    def run_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)

        print 'Total doc count', self.total_doc_count

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch,
                                                1000, 1, 0)
        batch_doc_processor.run()

        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)
Example #16
    def get_all_members(self):
        result_items = []

        params = {}
        params['rows'] = self.no_of_rows
        url = 'members'
        self.cursor = None
        request = self.create_request(url, params)
        response = self.perform_request(request)
        result_items = self.get_result_items_from_response(response)

        print 'Total members', len(result_items)
        file_utils.save_file('/data_loading', 'crossref_members.json',
                             result_items)
        return result_items
Example #17
    def run(self):
        xml_data_source = XMLDataSource(self.baseline_file, 2)
        xml_data_source.process_rows(self.process_baseline_row)

        print len(self.original_docs), self.current_baseline_file
        if len(self.original_docs) > 0:
            file_utils.save_file(
                self.load_config.generated_files_directory(),
                'original_docs_' + self.current_baseline_file + '.json',
                self.original_docs)

        file_utils.save_file(
            self.load_config.generated_files_directory(),
            'inverted_index_' + self.current_baseline_file + '.json',
            self.inverted_index)
Example #18
    def split_to_batches(self):
        load_config = self.get_load_config()
        other_files_directory = load_config.other_files_directory()

        # appl_id__pmid__mapping = file_utils.unpickle_file(other_files_directory, 'appl_id__pmid__mapping.json')
        pmid__appl_id__mapping = file_utils.unpickle_file(
            other_files_directory, 'pmid__appl_id__mapping.json')

        pubmed_ids = {}
        pubmed_ids = load_config.data_mapper.reformat(
            reformatted_array=pubmed_ids,
            relations_array=pmid__appl_id__mapping,
            dest_index_id=ID_IRDB,
            relationship_type=RELATIONSHIP_TYPE_CITATIONS)

        generated_files_directory = load_config.data_source_directory()
        pubmed_id_keys = pubmed_ids.keys()
        pubmed_id_keys.sort()

        max_batch_count = 10000

        batch_index = 0
        batch_ids = {}
        batch_file_names = []
        # Splitting into batches
        for _id in pubmed_id_keys:
            batch_ids[_id] = pubmed_ids[_id]

            if len(batch_ids) >= max_batch_count:
                print 'Writing batch:', batch_index
                batch_file_name = 'batch_' + str(batch_index) + '.json'
                batch_file_names.append(batch_file_name)
                file_utils.save_file(generated_files_directory,
                                     batch_file_name, batch_ids)

                batch_ids = {}
                batch_index += 1

        if len(batch_ids) > 0:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(generated_files_directory, batch_file_name,
                                 batch_ids)

            batch_index += 1

        return batch_file_names
Example #19
    def process_irdb_relations(self):
        generated_files_directory = self.load_config.data_source_directory()
        all_ids = file_utils.load_file(generated_files_directory,
                                       self.batch_file_name)

        processed_count = 0
        batch_count = 0

        ids_to_update = file_utils.load_file(
            generated_files_directory, 'ids_to_update_' + self.batch_file_name)
        processed_ids = file_utils.load_file(
            generated_files_directory, 'processed_ids_' + self.batch_file_name)

        filtered_ids = []
        for _id in all_ids:
            if _id not in processed_ids:
                filtered_ids.append(_id)

        print 'Processing', self.batch_file_name, len(filtered_ids), 'ids'

        self.batch_fetch_docs(filtered_ids, ID_IRDB)

        for _id in filtered_ids:
            processed_ids[_id] = ''

            processed_count += 1
            batch_count += 1

            if processed_count % 500 == 0:
                print 'Processing irdb', _id

            # print 'Processing', processed_count, '/', total_count
            derwent_ids = self.process_id(_id)
            if len(derwent_ids) > 0:
                # print ex_rl
                ids_to_update[_id] = derwent_ids

            if batch_count >= 500:
                batch_count = 0

                file_utils.save_file(generated_files_directory,
                                     'ids_to_update_' + self.batch_file_name,
                                     ids_to_update)
                file_utils.save_file(generated_files_directory,
                                     'processed_ids_' + self.batch_file_name,
                                     processed_ids)

        file_utils.save_file(generated_files_directory,
                             'ids_to_update_' + self.batch_file_name,
                             ids_to_update)
        file_utils.save_file(generated_files_directory,
                             'processed_ids_' + self.batch_file_name,
                             processed_ids)

        # file_utils.save_file(generated_files_directory, 'missing_pubmed_ids_' + self.batch_file_name, self.missing_pubmed_ids)

        print 'Docs to update..............................................', len(
            ids_to_update)
Example #20
    def run(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()

        print 'Total doc count', self.total_doc_count

        # self.create_destination_index(mapping=None)
        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

        print 'saving missing docs'

        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)
Example #21
    def combine_original_docs(self):
        files = []
        generated_files_directory = self.load_config.generated_files_directory()
        for name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory, name)
            if os.path.isfile(file_path) and name.startswith('original_docs_'):
                files.append(name)

        combined = {}
        for name in files:
            data = file_utils.load_file(generated_files_directory, name)
            combined.update(data)

        print 'Original docs', len(combined)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'original_docs.json', combined)
Example #22
    def combine_inverted_index(self):
        files = []
        generated_files_directory = self.load_config.generated_files_directory()
        for name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory, name)
            if os.path.isfile(file_path) and name.startswith(
                    'inverted_index_'):
                files.append(name)

        combined = {}
        for name in files:
            data = file_utils.load_file(generated_files_directory, name)
            combined.update(data)

        print 'Inverted index', len(combined)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'inverted_index.json', combined)
Example #23
    def process(self, data_directory):
        for name in os.listdir(data_directory):
            file_path = os.path.join(data_directory, name)
            if os.path.isfile(file_path) and name.endswith('.xml'):
                print 'Parsing file:', file_path
                xmltodict.parse(open(file_path),
                                item_depth=1,
                                item_callback=self.handle_row)

        file_utils.save_file(data_directory, 'schema.json', self.format)

        # ft = {
        #     "test": {
        #         "data": ["test"],
        #         "parents": [ "clinical_results", "outcome_list", "outcome", "analysis_list", "analysis", "param_type" ]
        #     }
        # }

        self.create_mapping(data_directory, self.format)
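
Example #23 relies on xmltodict's streaming mode: when item_depth and item_callback are supplied, xmltodict.parse invokes the callback once per element at that depth instead of building the whole document in memory, which keeps large XML files tractable. A small, hedged illustration of that callback contract (the file name and tag handling are made up for the example):

import xmltodict


def handle_row(path, item):
    # `path` is the list of (tag, attributes) pairs from the root down to this
    # element; `item` is the parsed content of the element itself.
    tag, _attrs = path[-1]
    print('Parsed element: ' + tag)
    # Returning True tells xmltodict to keep streaming; False stops parsing.
    return True


with open('example.xml', 'rb') as xml_file:
    xmltodict.parse(xml_file, item_depth=2, item_callback=handle_row)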
Example #24
    def split_to_batches(self):
        print 'Fetching doc ids for', self.load_config.index, self.load_config.type
        query = self.get_query()
        print json.dumps(query)

        all_ids = export_doc_ids.get_doc_ids(self.load_config.server,
                                             self.load_config.index,
                                             self.load_config.type,
                                             self.batch_docs_directory(),
                                             ALL_IDS_FILE,
                                             query=query)
        all_ids = all_ids.keys()
        all_ids.sort()

        print 'all_ids', len(all_ids)

        batch_file_names = []
        batch_index = 0
        batch_ids = []

        # Splitting into batches
        for _id in all_ids:
            batch_ids.append(_id)

            if len(batch_ids) >= self.batch_doc_count:
                print 'Writing batch:', batch_index
                batch_file_name = 'batch_' + str(batch_index) + '.json'
                batch_file_names.append(batch_file_name)
                file_utils.save_file(self.batch_docs_directory(),
                                     batch_file_name, batch_ids)
                batch_ids = []
                batch_index += 1

        if len(batch_ids) > 0:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(self.batch_docs_directory(), batch_file_name,
                                 batch_ids)
            batch_index += 1

        return batch_file_names
Example #25
    def process_file(self):
        file_name = os.path.basename(self.update_file)

        self.load_config.data_source_name = file_name.split('.')[0]

        print self.update_file

        data_source = XMLDataSource(self.update_file, 2)
        data_source.process_rows(self.process_row)

        print self.update_file
        print 'Docs with citations:', len(self.docs_with_citations)
        print 'New Docs:', len(self.new_docs)
        print 'Total Docs:', len(self.total_ids)

        if len(self.docs_with_citations) > 0:
            file_utils.make_directory("docs_with_citations")
            file_utils.save_file("docs_with_citations",
                                 self.load_config.data_source_name + '.json',
                                 self.docs_with_citations)
Example #26
    def process_docs(self, docs):
        print 'Processing docs', len(docs)

        citation_errors = {}
        for _id in docs:
            # print 'Processing doc', _id
            doc = docs[_id]

            citations_from_update_history = self.get_citations_from_data(doc)
            current_citations = self.get_citations(doc)

            if len(current_citations) != len(citations_from_update_history):
                citation_errors[_id] = citations_from_update_history

                print _id, 'current citations:', len(
                    current_citations), 'citations from update history:', len(
                        citations_from_update_history)

        file_utils.save_file(self.batch_docs_directory,
                             'citation_errors_' + self.batch_name + '.json',
                             citation_errors)
Example #27
    def run(self):
        # self.get_updated_docs()
        self.updated_docs = file_utils.load_file(
            self.load_config.other_files_directory(), 'updated_docs.json')
        print 'Updated docs:', len(self.updated_docs)
        print 'Original docs:', len(self.original_docs)

        # self.get_original_docs()
        # sys.exit(1)

        self.original_docs = file_utils.load_file(
            self.load_config.other_files_directory(), 'original_docs.json')
        self.inverted_index = file_utils.load_file(
            self.load_config.other_files_directory(), 'inverted_index.json')
        self.inverted_index_for_updated_docs = file_utils.load_file(
            self.load_config.other_files_directory(),
            'inverted_index_for_updated_docs.json')

        print 'Updated docs:', len(self.updated_docs)
        print 'Original docs:', len(self.original_docs)
        print 'Inverted index:', len(self.inverted_index)
        print 'inverted_index_for_updated_docs:', len(
            self.inverted_index_for_updated_docs)
        # print json.dumps(self.inverted_index_for_updated_docs)
        # input = raw_input('Continue?')
        # if input.lower() in ['n', 'no', '0']:
        #     sys.exit(1)

        self.update_docs()

        print 'Docs with updates', len(self.docs_with_updates)
        # print json.dumps(self.docs_with_updates)

        print 'Missing docs'
        print json.dumps(self.missing_docs.keys())

        file_utils.save_file(self.load_config.other_files_directory(),
                             'docs_with_updates.json', self.docs_with_updates)
Example #28
    def send_notifications(self, prospects):
        load_config = self.get_load_config()
        email_client = EmailClient(load_config)
        # print 'Prospects', all_prospects
        # self.get_logger().info('Prospects ' + str(prospects))
        failed_prospects = []

        local_date = self.local_date_time.split(' ')[0]

        # Send email notifications
        for prospect in prospects:
            problems = email_client.send_notification_for_prospect(
                prospect, local_date)
            if len(problems) > 0:
                failed_prospects.append({
                    'problems': problems,
                    'prospect': prospect
                })

        # Dump failed prospects to file
        if len(failed_prospects) > 0:
            file_utils.save_file(
                self.get_load_config().other_files_directory(),
                'failed_prospects.json', failed_prospects)
Example #29
    def load(self):
        generated_files_directory = self.load_config.data_source_directory()
        print 'Processing batch', self.batch_file_name

        processes_ids_file_name = 'processed_ids_' + self.batch_file_name
        ids_to_update_file_name = 'ids_to_update_' + self.batch_file_name

        # Get processed ids
        processed_ids = file_utils.load_file(generated_files_directory,
                                             processes_ids_file_name)
        if processed_ids is None or len(processed_ids) == 0:
            print 'Processed ids file not found, aborting...'
            return

        # Get batch ids
        batch_ids = file_utils.load_file(generated_files_directory,
                                         self.batch_file_name)
        if batch_ids is None or len(batch_ids) == 0:
            print 'batch ids not found, aborting....'
            return

        # Continue processing
        print 'Processed ids count:', len(processed_ids), ' ~ ', len(batch_ids)

        if len(processed_ids) != len(batch_ids):
            print 'Processing not finished, aborting...'
            return
        else:
            print 'Processing complete for', self.batch_file_name, ', proceeding with data load...'

        ids_to_update = file_utils.load_file(generated_files_directory,
                                             ids_to_update_file_name)

        # Get the loaded ids
        loaded_ids = self.get_loaded_ids(self.reports_directory)

        total_count = len(ids_to_update)
        count = 0

        filtered_ids = []
        for _id in ids_to_update:
            if _id not in loaded_ids:
                filtered_ids.append(_id)

        print 'Ids to update:', len(filtered_ids)

        reformatted_array = {}
        for _id in filtered_ids:
            count += 1

            derwent_ids = ids_to_update[_id]

            uspto_ids = []
            for derwent_id in derwent_ids:
                uspto_id = derwent_id.replace('DP', '')
                if uspto_id in self.uspto_ids:
                    uspto_ids.append(uspto_id)

            if _id not in reformatted_array:
                reformatted_array[_id] = []

            if len(derwent_ids) > 0:
                relationship = {
                    'index_id': ID_DERWENT_PATENTS,
                    'ids': derwent_ids,
                    'type': RELATIONSHIP_TYPE_RELATIONS
                }

                reformatted_array[_id].append(relationship)

            if len(uspto_ids) > 0:
                relationship = {
                    'index_id': ID_USPTO,
                    'ids': uspto_ids,
                    'type': RELATIONSHIP_TYPE_RELATIONS
                }

                reformatted_array[_id].append(relationship)

            # print 'Reformatted ids', reformatted_array
            # time.sleep(10)
            # if len(uspto_ids) > 0:
            #     print _id, len(derwent_ids)
            #     print derwent_ids
            #     print uspto_ids
            #     time.sleep(4)

            # print extended_relations

        if len(reformatted_array) > 0:
            self.load_ids(reformatted_array)
            file_utils.save_file(self.reports_directory,
                                 'loaded_ids_' + self.batch_file_name,
                                 reformatted_array.keys())
Example #30
def set_downloaded_files(other_files_directory, downloaded_files):
    file_utils.save_file(other_files_directory, USPTO_DOWNLOADED_FILES,
                         downloaded_files)