Example #1
    def execute(self, context):
        es = reach.elastic.common.connect(self.es_host, self.es_port)
        s3 = WellcomeS3Hook()

        # TODO: implement skipping mechanism
        fulltext_docs.clean_es(es, self.es_index, self.organisation)

        self.log.info(
            'Getting %s pubs from %s',
            self.max_items if self.max_items else 'all',
            self.src_s3_key,
        )

        s3_object = s3.get_key(self.src_s3_key)
        with tempfile.NamedTemporaryFile() as tf:
            s3_object.download_fileobj(tf)
            tf.seek(0)
            count = fulltext_docs.insert_file(
                tf,
                es,
                self.organisation,
                max_items=self.max_items,
                es_index=self.es_index,
            )
        self.log.info('import complete count=%d', count)
Example #2
    def execute(self, context):
        with safe_import():
            from reach.refparse.refparse import yield_structured_references

        # Use the builtin map, i.e. process references serially in-process
        pool_map = map
        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        with tempfile.NamedTemporaryFile() as split_rawf, \
                tempfile.NamedTemporaryFile() as parsed_rawf:
            with gzip.GzipFile(mode='wb', fileobj=split_rawf) as split_f, \
                    gzip.GzipFile(mode='wb', fileobj=parsed_rawf) as parsed_f:
                refs = yield_structured_references(self.src_s3_key, pool_map,
                                                   logger)
                for split_references, parsed_references in refs:
                    split_f.write(json.dumps(split_references).encode('utf-8'))
                    split_f.write(b'\n')
                    for ref in parsed_references:
                        parsed_f.write(json.dumps(ref).encode('utf-8'))
                        parsed_f.write(b'\n')

            split_rawf.flush()
            parsed_rawf.flush()

            s3.load_file(
                filename=split_rawf.name,
                key=self.split_s3_key,
                replace=True,
            )

            s3.load_file(
                filename=parsed_rawf.name,
                key=self.parsed_s3_key,
                replace=True,
            )
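
Example #2 (and most of the examples below) writes its output as gzip-compressed JSON Lines: each record is serialised with json.dumps, UTF-8 encoded, and followed by a newline before the file is uploaded with s3.load_file. A minimal sketch of a reader for that format, assuming only what those write calls show (the helper name read_jsonl_gz is hypothetical):

import gzip
import json

def read_jsonl_gz(path):
    # Stream a gzip-compressed JSON Lines file: one JSON object per
    # non-empty line, UTF-8 encoded.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)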
Example #3
    def execute(self, context):
        with safe_import():
            from reach.refparse.refparse import exact_match_publication

        publications_path = 's3://{path}'.format(path=self.publications_path)
        exact_matched_references_path = 's3://{path}'.format(
            path=self.exact_matched_references_path)

        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        exact_matcher = ElasticsearchExactMatcher(self.es,
                                                  self.title_length_threshold)

        with tempfile.NamedTemporaryFile(mode='wb') as output_raw_f:
            with gzip.GzipFile(mode='wb', fileobj=output_raw_f) as output_f:
                publications = yield_publications(s3, publications_path)
                for publication in publications:
                    exact_matched_references = exact_match_publication(
                        exact_matcher, publication)
                    for exact_matched_reference in exact_matched_references:
                        if exact_matched_reference:
                            logger.info("Match")

                            output_f.write(
                                json.dumps(exact_matched_reference).encode(
                                    'utf-8'))
                            output_f.write(b'\n')

            output_raw_f.flush()

            s3.load_file(
                filename=output_raw_f.name,
                key=exact_matched_references_path,
                replace=True,
            )
Example #4
    def execute(self, context):

        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        results = []

        # Read data from S3
        gold = _read_json_gz_from_s3(s3, self.gold_s3_key)
        reach = _read_json_gz_from_s3(s3, self.reach_s3_key)

        evaluator = ReachEvaluator(gold, reach)
        eval_results = evaluator.eval()

        # Add additional metadata

        eval_results['gold_refs'] = self.gold_s3_key
        eval_results['reach_refs'] = self.reach_s3_key
        eval_results['reach_params'] = self.reach_params

        # Write the results to S3
        _write_json_gz_to_s3(s3, [eval_results], key=self.dst_s3_key)

        self.log.info(
            'EvaluateOperator: Finished Evaluating Reach matches'
        )
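
Example #4 relies on _read_json_gz_from_s3 and _write_json_gz_to_s3, whose implementations are not shown in these examples. The sketch below is an assumption that only mirrors the temp-file, gzip and JSON Lines pattern the other operators use (get_key().download_fileobj() for reads, load_file() for writes); the real helpers may differ.

import gzip
import json
import tempfile

def _read_json_gz_from_s3(s3, key):
    # Download the object into a temporary file, then parse one JSON
    # document per line (sketch only).
    with tempfile.TemporaryFile(mode='rb+') as tf:
        s3.get_key(key).download_fileobj(tf)
        tf.seek(0)
        with gzip.GzipFile(mode='rb', fileobj=tf) as f:
            return [json.loads(line) for line in f if line.strip()]

def _write_json_gz_to_s3(s3, items, key):
    # Write items as gzip-compressed JSON Lines to a temporary file,
    # then upload it (sketch only).
    with tempfile.NamedTemporaryFile(mode='wb') as raw_f:
        with gzip.GzipFile(mode='wb', fileobj=raw_f) as f:
            for item in items:
                f.write(json.dumps(item).encode('utf-8'))
                f.write(b'\n')
        raw_f.flush()
        s3.load_file(filename=raw_f.name, key=key, replace=True)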
Example #5
    def execute(self, context):
        # Normalise each policy document's metadata and decide on its title
        logger.info("Deciding on policy title")
        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        results = []

        with tempfile.TemporaryFile(mode='rb+') as tf:
            key = s3.get_key(self.src_s3_key)
            key.download_fileobj(tf)
            tf.seek(0)
            with gzip.GzipFile(mode='rb', fileobj=tf) as f:
                for line in f:
                    data = json.loads(line)
                    source_meta = data.get("source_metadata", {})
                    pdf_meta = data.get("pdf_metadata", {})
                    p_name = PolicyNameCandidates(data)
                    results.append(
                        json.dumps({
                            'file_hash': data.get("file_hash"),
                            'keywords': data.get('keywords', {}),
                            'text': data.get('text', ''),
                            'sections': data.get('sections', []),
                            'url': source_meta.get("url", None),
                            'source_page': source_meta.get('source_page', None),
                            'title': p_name.get_title(),
                            'authors': source_meta.get("authors", None),
                            'year': source_meta.get("year", None),
                            'subjects': source_meta.get("subjects", None),
                            'created': pdf_meta.get("created", None),
                            'types': source_meta.get("types", None),
                        }))
        # Write the results to S3
        with tempfile.NamedTemporaryFile(mode='wb') as output_raw_f:
            with gzip.GzipFile(mode='wb', fileobj=output_raw_f) as output_f:
                for item in results:
                    output_f.write(item.encode("utf-8"))
                    output_f.write(b"\n")

            output_raw_f.flush()
            s3.load_file(filename=output_raw_f.name,
                         key=self.dst_s3_key,
                         replace=True)
            logger.info(
                'PolicyNameNormalizerOperator: Done normalizing policy names')
Example #6
    def execute(self, context):
        with safe_import():
            from reach.refparse.refparse import fuzzy_match_reference

        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        fuzzy_matcher = ElasticsearchFuzzyMatcher(
            self.es,
            self.score_threshold,
            self.should_match_threshold,
            self.es_index,
            self.organisation,
        )
        refs = yield_structured_references(s3, self.src_s3_key)
        match_count = 0
        count = 0
        references = {}
        for count, structured_reference in enumerate(refs, 1):
            if count % 500 == 0:
                logger.info('FuzzyMatchRefsOperator: references=%d', count)
            fuzzy_matched_reference = fuzzy_match_reference(
                fuzzy_matcher, structured_reference)
            if fuzzy_matched_reference:
                ref_id = fuzzy_matched_reference['reference_id']
                if ref_id in references:
                    references[ref_id]['associated_policies_count'] += 1
                    references[ref_id]['policies'].append(
                        fuzzy_matched_reference['policies'][0])
                else:
                    references[ref_id] = fuzzy_matched_reference

                match_count += 1
                if match_count % 100 == 0:
                    logger.info('FuzzyMatchRefsOperator: matches=%d',
                                match_count)

        with tempfile.NamedTemporaryFile(mode='wb') as output_raw_f:
            with gzip.GzipFile(mode='wb', fileobj=output_raw_f) as output_f:
                for reference in references.values():
                    output_f.write(json.dumps(reference).encode('utf-8'))
                    output_f.write(b'\n')

            output_raw_f.flush()
            s3.load_file(
                filename=output_raw_f.name,
                key=self.dst_s3_key,
                replace=True,
            )
            logger.info('FuzzyMatchRefsOperator: references=%d matches=%d',
                        count, match_count)

            logger.info('FuzzyMatchRefsOperator: Matches saved to %s',
                        s3.get_key(self.dst_s3_key))
Example #7
    def execute(self, context):
        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        results = []

        # Download and open the two annotated data files.

        refs = _read_json_gz_from_s3(s3, self.refs_s3_key)
        titles = _read_json_gz_from_s3(s3, self.titles_s3_key)

        self.log.info(
            'AddDocidToTitleAnnotations read %d lines from %s',
            len(refs),
            self.refs_s3_key
        )

        self.log.info(
            'AddDocidToTitleAnnotations read %d lines from %s',
            len(titles),
            self.titles_s3_key
        )

        # Create lookup dict mapping input_hash to meta data

        metas = {doc.get('_input_hash'): doc.get('meta') for doc in refs}
        annotated_with_meta = []

        for doc in titles:
            doc['meta'] = metas.get(doc['_input_hash'])
            annotated_with_meta.append(doc)

        _write_json_gz_to_s3(s3, annotated_with_meta, key=self.dst_s3_key)

        self.log.info(
            'AddDocidToTitleAnnotations wrote %d lines to %s.',
            len(annotated_with_meta),
            self.dst_s3_key
        )

        self.log.info(
            'AddDocidToTitleAnnotations: Done extracting refs from '
            'annotated data.'
        )
Example #8
    def execute(self, context):
        with safe_import():
            from reach.refparse.refparse import fuzzy_match_reference

        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        fuzzy_matcher = ElasticsearchFuzzyMatcher(
            self.es,
            self.score_threshold,
            self.should_match_threshold,
            self.es_index,
        )

        with tempfile.NamedTemporaryFile(mode='wb') as output_raw_f:
            with gzip.GzipFile(mode='wb', fileobj=output_raw_f) as output_f:
                refs = yield_structured_references(s3, self.src_s3_key)
                match_count = 0
                count = 0
                for count, structured_reference in enumerate(refs, 1):
                    if count % 500 == 0:
                        logger.info('FuzzyMatchRefsOperator: references=%d',
                                    count)
                    fuzzy_matched_reference = fuzzy_match_reference(
                        fuzzy_matcher, structured_reference)
                    if fuzzy_matched_reference:
                        match_count += 1
                        if match_count % 100 == 0:
                            logger.info('FuzzyMatchRefsOperator: matches=%d',
                                        match_count)
                        output_f.write(
                            json.dumps(fuzzy_matched_reference).encode(
                                'utf-8'))
                        output_f.write(b'\n')

            output_raw_f.flush()
            s3.load_file(
                filename=output_raw_f.name,
                key=self.dst_s3_key,
                replace=True,
            )
            logger.info('FuzzyMatchRefsOperator: references=%d matches=%d',
                        count, match_count)
Example #9
    def execute(self, context):
        es = reach.elastic.common.connect(
            self.es_hosts)
        s3 = WellcomeS3Hook()

        # TODO: implement skipping mechanism
        epmc_metadata.clean_es(es, self.es_index)

        self.log.info(
            'Getting %s pubs from %s',
            self.max_epmc_metadata if self.max_epmc_metadata else 'all',
            self.src_s3_key,
        )

        if self.src_s3_key.startswith("file://"):
            # Use a local file
            file_path = self.src_s3_key.replace("file:/", "")
            with open(file_path, "rb") as f:
                count = epmc_metadata.insert_file(
                    f,
                    es,
                    max_items=self.max_epmc_metadata,
                    es_index=self.es_index
                )
        else:
            # Download S3 object
            s3_object = s3.get_key(self.src_s3_key)
            with tempfile.NamedTemporaryFile() as tf:
                s3_object.download_fileobj(tf)
                tf.seek(0)
                count = epmc_metadata.insert_file(
                    tf,
                    es,
                    max_items=self.max_epmc_metadata,
                    es_index=self.es_index,
                )

        self.log.info('execute: insert complete count=%d', count)
Example #10
    def execute(self, context):
        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        fuzzy_matches = _get_fuzzy_matches(
            s3, self.src_s3_dir_key, self.organisations)

        self.log.info(
            'CombineReachFuzzyMatchesOperator: read %d lines from %s files',
            len(fuzzy_matches),
            len(self.organisations),
        )

        # Write the results to S3

        _write_json_gz_to_s3(s3, fuzzy_matches, key=self.dst_s3_key)

        self.log.info(
            'CombineReachFuzzyMatchesOperator: wrote %d lines to %s.',
            len(fuzzy_matches),
            self.dst_s3_key
        )
        self.log.info(
            'CombineReachFuzzyMatchesOperator: Done combining reach fuzzy matches.'
        )
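
The _get_fuzzy_matches helper used in Example #10 is also not shown. A hypothetical sketch, assuming one gzipped JSON Lines file per organisation under src_s3_dir_key (the key layout is an assumption) and reusing the _read_json_gz_from_s3 sketch above:

def _get_fuzzy_matches(s3, src_s3_dir_key, organisations):
    # Collect fuzzy matches from one file per organisation; the
    # '<dir>/<organisation>.json.gz' layout is assumed, not confirmed.
    matches = []
    for organisation in organisations:
        key = '{}/{}.json.gz'.format(src_s3_dir_key, organisation)
        matches.extend(_read_json_gz_from_s3(s3, key))
    return matches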
Example #11
    def execute(self, context):
        es = reach.elastic.common.connect(self.es_host, self.es_port)
        s3 = WellcomeS3Hook()

        # TODO: implement skipping mechanism
        epmc_metadata.clean_es(es, self.es_index)

        self.log.info(
            'Getting %s pubs from %s',
            self.max_epmc_metadata if self.max_epmc_metadata else 'all',
            self.src_s3_key,
        )

        s3_object = s3.get_key(self.src_s3_key)
        with tempfile.NamedTemporaryFile() as tf:
            s3_object.download_fileobj(tf)
            tf.seek(0)
            count = epmc_metadata.insert_file(
                tf,
                es,
                max_items=self.max_epmc_metadata,
                es_index=self.es_index,
            )
        self.log.info('execute: insert complete count=%d', count)
Example #12
    def execute(self, context):
        s3 = WellcomeS3Hook(aws_conn_id=self.aws_conn_id)

        results = []

        # Download and open the two annotated data files.

        annotated_with_meta = _read_json_gz_from_s3(s3, self.src_s3_key)

        self.log.info(
            'ExtractRefsFromGoldDataOperator read %d lines from %s',
            len(annotated_with_meta),
            self.src_s3_key
        )

        # Create lookup dict mapping input_hash to meta data

        annotated_titles = []

        for doc in annotated_with_meta:
            doc_hash = None
            meta = doc.get('meta', dict())

            # Get metadata if it exists (this will contain the document hash,
            # the unique id for the downloaded document assigned by Reach).

            if meta:
                doc_hash = meta.get('doc_hash')

                # Only add the reference if there is a doc_hash; without one
                # the reference is not useful for evaluation. This may occur
                # when it was not possible to reconcile the _input_hash from
                # the title annotation with the _input_hash from the reference
                # annotation, which contains the full metadata. Going forward
                # this should not occur if the examples annotated for titles
                # are drawn from those annotated for references.

                if doc_hash:
                    spans = doc.get('spans')

                    # Get spans, and create references from them. Note that
                    # these spans need to be TITLE spans, i.e. reference level
                    # spans, not individual token level spans! Each span yields
                    # one reference dict recording the title and the doc_hash
                    # of the document in which that title was found.

                    if spans:
                        for span in spans:
                            title = _get_span_text(doc['text'], span)
                            annotated_titles.append(
                                {
                                    'document_id': doc_hash,
                                    'Title': title,
                                    'metadata': {'file_hash': doc_hash},
                                    'reference_id': hash(title)
                                }
                            )

        _write_json_gz_to_s3(s3, annotated_titles, key=self.dst_s3_key)

        self.log.info(
            'ExtractRefsFromGoldDataOperator wrote %d lines to %s.',
            len(annotated_titles),
            self.dst_s3_key
        )

        self.log.info(
            'ExtractRefsFromGoldDataOperator: Done extracting refs from '
            'annotated data.'
        )
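
Example #12 also calls a _get_span_text helper that is not shown. Assuming Prodigy-style span dicts with character offsets under 'start' and 'end' (an assumption, not confirmed by these examples), a minimal sketch would be:

def _get_span_text(text, span):
    # Slice the annotated document's text by the span's character offsets
    # (assumes 'start'/'end' keys; the real span schema may differ).
    return text[span['start']:span['end']]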