Exemplo n.º 1
0
    def __init__(self):
        """Initialise empty ID bookkeeping and the helper objects.

        Creates a ``DataUtils`` instance and a ``DataLoaderUtils`` bound to
        ``SERVER`` / ``OLD_INDEX`` / ``OLD_TYPE``.
        """
        self.missing_ids = dict()
        self.new_ids = dict()
        self.data_utils = DataUtils()

        # The two trailing arguments are empty strings (presumably the
        # username/password slots of DataLoaderUtils -- confirm).
        old_index_loader = DataLoaderUtils(SERVER, OLD_INDEX, OLD_TYPE, '', '')
        self.data_loader_utils = old_index_loader

        self.docs_for_dolan = dict()
    def __init__(self, ct_load_config):
        """Store the given load config and prepare empty relation maps.

        Args:
            ct_load_config: Load configuration object kept on the instance
                as ``self.ct_load_config``.
        """
        self.ct_load_config = ct_load_config
        # Obtained from this instance's own getter, after ct_load_config
        # has been stored.
        self.pubmed_load_config = self.get_pubmed_load_config()

        # Two independent, initially empty mappings.
        self.pubmed_relations, self.ct_relations = {}, {}

        self.processed_docs = 0
        self.data_utils = DataUtils()
Exemplo n.º 3
0
    def __init__(self, server, index, type):
        """Remember the source location and reset the processing state.

        Args:
            server: Server value stored as ``self.server``.
            index: Index name stored as ``self.src_index``.
            type: Type name stored as ``self.src_type``. The parameter
                shadows the builtin, but its name is part of the public
                signature and is therefore kept.
        """
        self.server = server
        # The previous body read ``src_index`` / ``src_type``, which are not
        # parameters of this method; that raises NameError unless globals of
        # the same name happen to exist. Store the actual arguments instead.
        self.src_index = index
        self.src_type = type

        self.docs_with_issues = {}
        self.processed_docs = 0

        self.data_utils = DataUtils()
Exemplo n.º 4
0
 def batch_fetch_docs(self, ids, index_id):
     """Fetch documents for ``ids`` into the container matching ``index_id``.

     ``ID_IRDB`` fetches from ``LOCAL_SERVER`` / ``INDEX`` / ``TYPE`` into
     ``self.docs_fetched_irdb``; ``ID_PUBMED`` fetches from ``SERVER`` using
     the ``INDEX_MAPPING`` entry for ``index_id`` into
     ``self.docs_fetched_pubmed``. Any other ``index_id`` does nothing.

     Args:
         ids: IDs handed to ``DataUtils.batch_fetch_docs_for_ids``.
         index_id: Selector compared against ``ID_IRDB`` and ``ID_PUBMED``.
     """
     fetcher = DataUtils()

     if index_id == ID_IRDB:
         fetcher.batch_fetch_docs_for_ids(
             LOCAL_SERVER, ids, INDEX, TYPE, self.docs_fetched_irdb, 1000)
         return

     if index_id == ID_PUBMED:
         mapping = INDEX_MAPPING[index_id]
         fetcher.batch_fetch_docs_for_ids(
             SERVER, ids, mapping['index'], mapping['type'],
             self.docs_fetched_pubmed, 1000)
Exemplo n.º 5
0
    def __init__(self, src_server, dest_server, src_index, src_type,
                 dst_index, dst_type, username, password):
        """Create source/destination loaders and make sure TEMP_DIR exists.

        Args:
            src_server: Server passed to the source ``DataLoaderUtils``.
            dest_server: Server passed to the destination ``DataLoaderUtils``.
            src_index: Index passed to the source loader.
            src_type: Type passed to the source loader.
            dst_index: Index passed to the destination loader.
            dst_type: Type passed to the destination loader.
            username: Stored as ``self.username``.
            password: Stored as ``self.password``.
        """
        self.src_data_loader_utils = DataLoaderUtils(
            src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(
            dest_server, dst_index, dst_type)

        self.data_utils = DataUtils()

        # Credentials are only stored here; they are not passed to the
        # loaders above.
        self.username, self.password = username, password

        file_utils.make_directory(TEMP_DIR)
Exemplo n.º 6
0
    def __init__(self, src_server, dest_server, src_index, src_type, dst_index,
                 dst_type):
        """Create source/destination loaders and zero the document counters.

        Args:
            src_server: Server passed to the source ``DataLoaderUtils``.
            dest_server: Server passed to the destination ``DataLoaderUtils``.
            src_index: Index passed to the source loader.
            src_type: Type passed to the source loader.
            dst_index: Index passed to the destination loader.
            dst_type: Type passed to the destination loader.
        """
        self.src_data_loader_utils = DataLoaderUtils(
            src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(
            dest_server, dst_index, dst_type)

        # Both counters start at zero.
        self.processed_doc_count = self.total_doc_count = 0

        self.data_utils = DataUtils()
Exemplo n.º 7
0
    def __init__(self, server, src_index, src_type, process_doc_method):
        """Store the source location, the per-document callback and tuning values.

        Args:
            server: Stored as ``self.server``.
            src_index: Stored as ``self.index``.
            src_type: Stored as ``self.type``.
            process_doc_method: Stored as ``self.process_doc_method``
                (presumably invoked for each document -- confirm at call site).
        """
        self.server, self.index, self.type = server, src_index, src_type
        self.process_doc_method = process_doc_method

        # Fixed tuning values; units are not visible from this method.
        self.batch_size = 5000
        self.process_count = 2
        self.process_spawn_delay = 0.15
        self.bulk_data_size = 300000

        # Same three values that were just stored on the instance.
        self.data_loader_utils = DataLoaderUtils(server, src_index, src_type)

        self.data_utils = DataUtils()
    def __init__(self, load_config, data_source, data_source_summary):
        """Initialise the base processor, then the relationship-tracking state.

        Args:
            load_config: Forwarded to the base-class initialiser.
            data_source: Forwarded to the base-class initialiser.
            data_source_summary: Stored as ``self.data_source_summary``.
        """
        super(PubmedRelationshipProcessor,
              self).__init__(load_config, data_source)
        self.data_source_summary = data_source_summary

        # ``self.load_config`` is presumably set by the base initialiser --
        # it is read here without being assigned in this method.
        config = self.load_config
        self.data_loader_utils = DataLoaderUtils(
            config.server,
            config.index,
            config.type,
            config.server_username,
            config.server_password)
        self.load_relationships = True

        self.docs_with_new_citations, self.docs_citations_history = {}, {}

        self.existing_docs = dict()

        self.data_utils = DataUtils()
    def run(self):
        """Collect all document IDs, fetch their documents, then group them.

        The IDs come from ``get_doc_ids`` (given the file name
        ``INITIAL_GRANT_ALL_IRDB_IDS.json`` inside the load config's
        other-files directory). The documents are fetched into
        ``self.docs_fetched`` and ``self.process_grant_num_groups()`` runs last.
        """
        config = self.load_config
        ids_by_key = get_doc_ids(
            server=config.server,
            src_index=config.index,
            src_type=config.type,
            dest_dir=config.other_files_directory(),
            dest_file_name="INITIAL_GRANT_ALL_IRDB_IDS.json")

        # Only the keys of the returned mapping are used.
        all_ids = ids_by_key.keys()
        self.total_doc_count = len(all_ids)

        fetcher = DataUtils()
        fetcher.batch_fetch_docs_for_ids(
            base_url=self.load_config.server,
            ids=all_ids,
            index=self.load_config.index,
            type=self.load_config.type,
            docs_fetched=self.docs_fetched)

        self.process_grant_num_groups()
Exemplo n.º 10
0
    def __init__(self, src_server, dest_server, src_index, src_type, dst_index,
                 dst_type, username, password):
        """Create source/destination loaders and reset all progress state.

        Args:
            src_server: Server passed to the source ``DataLoaderUtils``.
            dest_server: Server passed to the destination ``DataLoaderUtils``.
            src_index: Index passed to the source loader.
            src_type: Type passed to the source loader.
            dst_index: Index passed to the destination loader.
            dst_type: Type passed to the destination loader.
            username: Stored as ``self.username``.
            password: Stored as ``self.password``.
        """
        self.src_data_loader_utils = DataLoaderUtils(
            src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(
            dest_server, dst_index, dst_type)

        self.processed_doc_count = self.total_doc_count = 0

        self.data_utils = DataUtils()

        # Two separate lists -- they must not share one object.
        self.relations_to_exclude, self.missing_destination_ids = [], []

        self.username, self.password = username, password

        self.last_time_stamp = self.diff_average = 0
Exemplo n.º 11
0
 def __init__(self, batch_docs_directory, load_config, batch_name):
     """Keep the batch location, its name and the load configuration.

     Args:
         batch_docs_directory: Stored as ``self.batch_docs_directory``.
         load_config: Stored as ``self.load_config``.
         batch_name: Stored as ``self.batch_name``.
     """
     self.load_config = load_config
     self.batch_docs_directory, self.batch_name = (
         batch_docs_directory, batch_name)
     self.data_utils = DataUtils()
Exemplo n.º 12
0
 def __init__(self, load_config):
     """Initialise the base class with fixed batch settings and add helpers.

     The base initialiser receives ``batch_doc_count=5000`` and
     ``multiprocess=True``.

     Args:
         load_config: Forwarded to the base class and also stored as
             ``self.load_config``.
     """
     super(FixCitedBys, self).__init__(
         load_config,
         batch_doc_count=5000,
         multiprocess=True)
     self.load_config = load_config
     self.data_utils = DataUtils()
Exemplo n.º 13
0
    def process_id(self, _id):
        """Return the IDs found by a grant-number phrase query for ``_id``.

        The document for ``_id`` is read from ``self.irdb_docs``. When it has
        both ``admin_phs_org_code`` and ``serial_num``, six spellings of the
        grant number are built (the two values joined by ``''``, ``'-'``,
        ``'0'``, ``'-0'``, ``' '`` and ``' 0'``). They are combined into one
        ``bool``/``should`` query of ``match_phrase`` clauses on
        ``government_support``, run against the ``ID_DERWENT_PATENTS`` entry
        of ``INDEX_MAPPING``.

        Args:
            _id: Key looked up in ``self.irdb_docs``.

        Returns:
            The result of ``DataUtils.batch_fetch_ids_for_query``, or an
            empty list when the document is missing, is ``None``, or lacks
            either field (no query is issued in those cases).
        """
        # Joiners between org code and serial number, in query order.
        joiners = ('', '-', '0', '-0', ' ', ' 0')

        document = self.irdb_docs[_id] if _id in self.irdb_docs else None
        if document is None:
            return []

        org_code = (document['admin_phs_org_code']
                    if 'admin_phs_org_code' in document else None)
        serial = document['serial_num'] if 'serial_num' in document else None
        if org_code is None or serial is None:
            return []

        grant_numbers = []
        for joiner in joiners:
            grant_numbers.append(org_code + joiner + serial)

        # One match_phrase clause per spelling; any of them may match.
        should_clauses = [
            {"match_phrase": {"government_support": number}}
            for number in grant_numbers
        ]
        query = {"bool": {"should": should_clauses}}

        data_utils = DataUtils(self.session)
        patents_mapping = INDEX_MAPPING[ID_DERWENT_PATENTS]
        return data_utils.batch_fetch_ids_for_query(
            base_url=SERVER,
            query=query,
            index=patents_mapping['index'],
            type=patents_mapping['type'])