示例#1
0
    def transform(self):
        """Fill ``self.transform_model_instance`` from the extracted issue.

        Builds a xylose ``Issue`` from ``self.clean_for_xylose()``, copies the
        uuid of ``self.extract_model_instance`` into the ``uuid`` and ``iid``
        fields, stamps ``created`` and ``updated`` with the current time and
        looks up the ``TransformJournal`` matching the issue's journal acronym.

        Raises:
            Exception: whatever the ``TransformJournal`` lookup raised; the
                failure is logged together with the acronym and re-raised.
        """
        xylose_source = self.clean_for_xylose()
        xylose_issue = Issue(xylose_source)

        # jid: both identifiers reuse the uuid of the extracted instance
        uuid = self.extract_model_instance.uuid
        self.transform_model_instance['uuid'] = uuid
        self.transform_model_instance['iid'] = uuid

        # created
        self.transform_model_instance['created'] = datetime.now()

        # updated
        self.transform_model_instance['updated'] = datetime.now()

        # unpublish_reason -> left empty

        # journal
        acronym = xylose_issue.journal.acronym
        try:
            journal = TransformJournal.objects.get(acronym=acronym)
        except Exception:
            # The lookup failed: report which acronym could not be resolved
            # and let the caller decide how to handle it.
            logger.error(
                u"TransformJournal (acronym: %s) não encontrado!", acronym)
            raise
示例#2
0
def requestissue(config, issue_pid):
    """Request an issue from the ArticleMeta API and list its valid sections.

    Args:
        config: mapping in which ``config['articlemeta']['host']`` holds the
            base URL of the API.
        issue_pid: issue code sent as the ``code`` query parameter.

    Returns:
        A tuple ``(xissue, seccode_list)``: the ``Issue`` built from the JSON
        response and the section codes that have at least one text not
        listed in ``invalid_sec``.
    """
    # Request Issue
    # https://articlemeta.scielo.org/api/v1/issue/?code=0104-070720190001
    uissue = config['articlemeta'][
        'host'] + '/api/v1/issue/?code=%s' % issue_pid
    logger.info(uissue)

    xissue = None
    while xissue is None:
        try:
            rissue = requests.get(uissue)
            xissue = Issue(rissue.json())
        except requests.exceptions.Timeout as e:
            # ``e`` must be bound here; the original handler logged an
            # unbound name and failed with NameError instead of reporting.
            logger.info('error: %s', e)
            print("Timeout - Try again")
            leave()
        except requests.exceptions.RequestException as e:
            logger.info('error: %s', e)
            print("Request Error - Check your connection and try again")
            leave()
        except json.decoder.JSONDecodeError as e:
            logger.info('error: %s', e)
            print("Request Error - Try again")
            leave()

    # Valid Codes list
    seccode_list = []
    sections = xissue.sections
    if sections is not None:
        for seccode, sec in sections.items():
            # A section code is kept once as soon as any of its texts is
            # not in the invalid list.
            if any(sectext not in invalid_sec for sectext in sec.values()):
                seccode_list.append(seccode)

    return (xissue, seccode_list)
def process_issues(**context):
    """Process the list of issues loaded from the MST database read result.

    The JSON payload is pulled from the ``read_issue_mst`` task via XCom,
    converted to xylose issues, filtered, converted with ``issue_as_kernel``
    and each resulting record is sent to ``register_or_update``.
    """

    def filter_issues(issues: List[Issue]) -> List[Issue]:
        """Filter xylose issues, always dropping press releases and
        ahead of print entries."""
        kept = issues
        predicates = (
            lambda item: item.type != "pressrelease",
            lambda item: item.type != "ahead",
        )
        for predicate in predicates:
            kept = [item for item in kept if predicate(item)]
        return kept

    payload = context["ti"].xcom_pull(task_ids="read_issue_mst")
    xylose_issues = filter_issues(
        [Issue({"issue": record}) for record in json.loads(payload)]
    )
    # Convert everything first so nothing is registered if a conversion fails.
    kernel_issues = [issue_as_kernel(item) for item in xylose_issues]

    for bundle in kernel_issues:
        bundle_id = bundle.pop("_id")
        register_or_update(bundle_id, bundle, KERNEL_API_BUNDLES_ENDPOINT)
def ext_issue(code, **ext_params):
    """Request the issue identified by ``code`` and wrap it in an ``Issue``.

    The request goes to ``<config.AM_URL_API>/issue`` with the configured
    ``SCIELO_COLLECTION`` and ``code`` as query parameters.
    NOTE(review): the visible part of this function ends after building
    ``obj_issue``; ``ext_params`` is not used in it.
    """
    query = {"collection": config.get("SCIELO_COLLECTION"), "code": code}
    response = request.get("%s/issue" % config.AM_URL_API, params=query)
    obj_issue = Issue(response.json())
示例#5
0
 def test_issue_has_number_returns(self):
     """An issue whose ``v32`` is "ahead" keeps that number and gets the
     id "2448-167X-aop" after conversion."""
     raw = self.issue_json.copy()
     raw["v32"] = [{"_": "ahead"}]
     xylose_issue = Issue({"issue": raw})
     kernel_issue = issue_to_kernel(xylose_issue)
     self.assertEqual("ahead", xylose_issue.number)
     self.assertEqual("2448-167X-aop", kernel_issue["id"])
 def test_issue_data_to_link_returns_issue_data_to_link_to_journal(self):
     """The link data of the last sample issue carries the expected id,
     number, volume and year."""
     link = issue_data_to_link(Issue({"issue": self.issues[-1]}))
     expected_fields = (
         ("id", "1678-4464-2018-v1-n1"),
         ("number", "1"),
         ("volume", "1"),
         ("year", "2018"),
     )
     for key, expected in expected_fields:
         self.assertEqual(link[key], expected)
示例#7
0
    def check(self, metadata):
        """Enrich and normalize a copy of ``metadata``, the metadata of an issue.

        ``metadata`` has the same structure as the JSON returned by
        ``articlemeta.scielo.org``, for example:
        https://gist.github.com/gustavofonseca/4a5919db8d0027f37522da7d06bfa876

        The input is copied with ``metadata.copy()`` and the copy receives the
        keys ``code``, ``code_title``, ``collection``, ``issue_type``,
        ``publication_year`` and ``publication_date``. When the issue's
        ``processing_date`` is not already a ``datetime`` it is parsed as
        ``%Y-%m-%d``; if parsing fails the current time is used instead.

        Returns:
            The enriched copy of ``metadata``.
        """
        metadata_copy = metadata.copy()
        issue = Issue(metadata_copy)

        issns = {
            issue.journal.any_issn(priority=u'electronic'),
            issue.journal.any_issn(priority=u'print'),
            issue.journal.scielo_issn,
        }

        metadata_copy['code'] = issue.publisher_id
        metadata_copy['code_title'] = list(issns)
        metadata_copy['collection'] = issue.collection_acronym
        metadata_copy['issue_type'] = issue.type
        metadata_copy['publication_year'] = issue.publication_date[0:4]
        metadata_copy['publication_date'] = issue.publication_date

        processing_date = issue.data['issue']['processing_date']
        if not isinstance(processing_date, datetime):
            try:
                metadata_copy['processing_date'] = datetime.strptime(
                    processing_date, '%Y-%m-%d')
            except (TypeError, ValueError):
                # strptime raises ValueError on a format mismatch and
                # TypeError on a non-string value; fall back to "now"
                # without swallowing KeyboardInterrupt/SystemExit.
                metadata_copy['processing_date'] = datetime.now()

        return metadata_copy
 def test_should_should_include_electronic_issn(self):
     """An ``ONLIN`` entry in ``v435`` is returned alongside the other ISSN."""
     self.issue_json["v435"] = [{"t": "ONLIN", "_": "10000-000A"}]
     found = get_journal_issns_from_issue(Issue({"issue": self.issue_json}))
     found.sort()
     self.assertEqual(["0001-3714", "10000-000A"], found)
示例#9
0
    def _check_issue_meta(self, metadata):
        """Check the given metadata and add derived fields to it.

        ``metadata`` is modified in place: ``code``, ``code_title``,
        ``collection``, ``issue_type``, ``publication_year``,
        ``publication_date``, ``_shard_id`` and ``processing_date`` are set
        from the xylose ``Issue`` built on top of it. If reading
        ``issue.processing_date`` fails, today's date in ISO format is used.

        Returns:
            The same ``metadata`` object that was passed in.
        """

        issue = Issue(metadata)

        issns = {
            issue.journal.any_issn(priority=u'electronic'),
            issue.journal.any_issn(priority=u'print'),
            issue.journal.scielo_issn,
        }

        metadata['code'] = issue.publisher_id
        metadata['code_title'] = list(issns)
        metadata['collection'] = issue.collection_acronym
        metadata['issue_type'] = issue.type
        metadata['publication_year'] = issue.publication_date[0:4]
        metadata['publication_date'] = issue.publication_date
        metadata['_shard_id'] = uuid.uuid4().hex

        try:
            metadata['processing_date'] = issue.processing_date
        except Exception:
            # Best-effort fallback, but no longer swallowing
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            metadata['processing_date'] = datetime.now().date().isoformat()

        return metadata
示例#10
0
    def issue(self, code, collection, replace_journal_metadata=True):
        """Retrieve one issue through ``self.client`` and wrap it in ``Issue``.

        Args:
            code: issue code passed to ``get_issue``.
            collection: collection passed to ``get_issue``.
            replace_journal_metadata: forwarded to ``get_issue`` (it was
                previously ignored and always sent as ``True``).

        Returns:
            An ``Issue`` built from the decoded JSON, or ``None`` when the
            client returns a falsy result.

        Raises:
            ServerError: when the client raises its own ``ServerError``.
            ValueError: when the returned payload cannot be decoded as JSON.
        """
        try:
            issue = self.client.get_issue(
                code=code,
                collection=collection,
                replace_journal_metadata=replace_journal_metadata)
        except self.ARTICLEMETA_THRIFT.ServerError:
            msg = 'Error retrieving issue: %s_%s' % (collection, code)
            raise ServerError(msg)

        if not issue:
            logger.warning('Issue not found for: %s_%s' % (collection, code))
            return None

        try:
            jissue = json.loads(issue)
        except (TypeError, ValueError):
            # json.loads raises ValueError (JSONDecodeError) on malformed
            # text and TypeError on unsupported input types.
            msg = 'Fail to load JSON when retrienving document: %s_%s' % (
                collection, code)
            raise ValueError(msg)

        xissue = Issue(jissue)
        logger.info('Issue loaded: %s_%s' % (collection, code))

        return xissue
    def setUp(self):
        """Build a minimal issue record (``v65`` and ``v35`` only) plus its
        xylose and ``issue_to_kernel`` representations."""
        self.issue_json = {
            "v65": [{"_": "20190129"}],
            "v35": [{"_": "2448-167X"}],
        }

        self._issue = Issue({"issue": self.issue_json})
        self.issue = issue_to_kernel(self._issue)
示例#12
0
    def issue(self, code, collection):
        """Request one issue from the issue endpoint and wrap it in ``Issue``.

        The URL is ``self.ARTICLEMETA_URL`` joined with ``self.ISSUE_ENDPOINT``
        and the request is made through ``self._do_request`` with the
        ``collection`` and ``code`` parameters.

        Returns:
            An ``Issue`` for the response, or ``None`` when the request
            yields a falsy result.
        """
        endpoint = urljoin(self.ARTICLEMETA_URL, self.ISSUE_ENDPOINT)
        query = {'collection': collection, 'code': code}
        payload = self._do_request(endpoint, query)

        if payload:
            return Issue(payload)
        return None
 def test_issue_data_to_link_with_supplement(self):
     """A value set in ``v131`` or ``v132`` comes back as the link's
     ``supplement``, including the value "0"."""
     record = self.issues[-1]
     cases = (
         ("v131", u"2", "2"),
         ("v132", u"2", "2"),
         ("v131", u"0", "0"),
         ("v132", u"0", "0"),
     )
     for field, value, expected in cases:
         with self.subTest(field=field, value=value, expected=expected):
             # The shared sample record is updated in place for each case.
             record[field] = [{u"_": value}]
             link = issue_data_to_link(Issue({"issue": record}))
             self.assertEqual(link["supplement"], expected)
示例#14
0
def process_issues(**context):
    """Process the list of issues loaded from the MST database read result.

    The path of the issue JSON file is pulled via XCom from the
    ``copy_mst_bases_to_work_folder_task`` task (key ``issue_json_path``).
    The records are converted to xylose issues, filtered with
    ``filter_issues``, converted with ``issue_as_kernel`` and each result is
    sent to ``register_or_update``.
    """

    json_path = context["ti"].xcom_pull(
        task_ids="copy_mst_bases_to_work_folder_task", key="issue_json_path"
    )

    with open(json_path, "r") as handle:
        payload = handle.read()
        logging.info("reading file from %s." % (json_path))

    xylose_issues = filter_issues(
        [Issue({"issue": record}) for record in json.loads(payload)]
    )
    # Convert everything first so nothing is registered if a conversion fails.
    kernel_issues = [issue_as_kernel(item) for item in xylose_issues]

    for bundle in kernel_issues:
        bundle_id = bundle.pop("_id")
        register_or_update(bundle_id, bundle, KERNEL_API_BUNDLES_ENDPOINT)
示例#15
0
    def issue(self, code, collection, replace_journal_metadata=True):
        """Retrieve one issue through ``self.client`` and wrap it in ``Issue``.

        Args:
            code: issue code passed to ``get_issue``.
            collection: collection passed to ``get_issue``.
            replace_journal_metadata: forwarded to ``get_issue`` (it was
                previously ignored and always sent as ``True``).

        Returns:
            An ``Issue`` built from the decoded JSON payload.

        Raises:
            ServerError: when the client call fails or the payload cannot be
                decoded as JSON.
        """

        try:
            issue = self.client.get_issue(
                code=code,
                collection=collection,
                replace_journal_metadata=replace_journal_metadata)
        except Exception:
            # Any client failure is reported as ServerError, but
            # KeyboardInterrupt/SystemExit are no longer intercepted.
            msg = 'Error retrieving issue: %s_%s' % (collection, code)
            raise ServerError(msg)

        try:
            jissue = json.loads(issue)
        except (TypeError, ValueError):
            # json.loads raises ValueError (JSONDecodeError) on malformed
            # text and TypeError on unsupported input such as None.
            msg = 'Fail to load JSON when retrienving document: %s_%s' % (
                collection, code)
            raise ServerError(msg)

        xissue = Issue(jissue)

        logger.info('Issue loaded: %s_%s' % (collection, code))
        return xissue
示例#16
0
    def issue(self, code, collection, replace_journal_metadata=True):
        """Retrieve one issue through ``self.dispatcher`` and wrap it in ``Issue``.

        Args:
            code: issue code passed to the ``get_issue`` call.
            collection: collection passed to the ``get_issue`` call.
            replace_journal_metadata: forwarded to ``get_issue`` (it was
                previously ignored and always sent as ``True``).

        Returns:
            An ``Issue`` built from the decoded JSON, or ``None`` when the
            dispatcher returns a falsy result.

        Raises:
            ValueError: when the returned payload cannot be decoded as JSON.
        """
        issue = self.dispatcher(
            'get_issue',
            code=code,
            collection=collection,
            replace_journal_metadata=replace_journal_metadata)

        if not issue:
            logger.info('Issue not found for: %s_%s', collection, code)
            return None

        try:
            jissue = json.loads(issue)
        except (TypeError, ValueError):
            # json.loads raises ValueError (JSONDecodeError) on malformed
            # text and TypeError on unsupported input types.
            msg = 'Fail to load JSON when retrienving document: %s_%s' % (
                collection, code)
            raise ValueError(msg)

        xissue = Issue(jissue)
        logger.info('Issue loaded: %s_%s', collection, code)

        return xissue
示例#17
0
def mount_journals_issues_link(issues: List[dict]) -> dict:
    """Build the relation between journals and their issues.

    Returns a dictionary keyed by journal id whose values are lists of the
    link data produced by ``issue_data_to_link`` (plus an ``order`` entry
    taken from field v36). Issues are first passed through ``filter_issues``.
    Field v35 of the issue provides the ``journal_id`` the issue is linked to.

    :param issues: List of issues extracted from the MST database"""

    links_by_journal = {}
    xylose_issues = filter_issues([Issue({"issue": raw}) for raw in issues])

    for xylose_issue in xylose_issues:
        link = issue_data_to_link(xylose_issue)
        link["order"] = xylose_issue.data["issue"]["v36"][0]["_"]
        journal_id = xylose_issue.data.get("issue").get("v35")[0]["_"]

        # Identical link data is stored only once per journal.
        bucket = links_by_journal.setdefault(journal_id, [])
        if link not in bucket:
            bucket.append(link)

    return links_by_journal
示例#18
0
    def issues_bulk(self,
                    collection=None,
                    issn=None,
                    from_date=None,
                    until_date=None,
                    extra_filter=None,
                    limit=LIMIT):
        """Yield ``Issue`` objects page by page through ``self.dispatcher``.

        The range from ``from_date`` (default ``DEFAULT_FROM_DATE``) to
        ``until_date`` (default today, ``YYYY-MM-DD``) is split with
        ``dates_pagination``. For each window, ``get_issues`` is called with a
        growing ``offset`` until it returns ``None`` or an empty ``objects``
        list.
        """
        start = from_date or DEFAULT_FROM_DATE
        end = until_date or datetime.today().isoformat()[:10]

        for window_from, window_until in dates_pagination(start, end):
            offset = 0
            while True:
                raw_page = self.dispatcher(
                    'get_issues',
                    collection=collection,
                    issn=issn,
                    from_date=window_from,
                    until_date=window_until,
                    limit=limit,
                    offset=offset,
                    extra_filter=extra_filter,
                )
                if raw_page is None:
                    break

                batch = json.loads(raw_page).get('objects', [])
                if len(batch) == 0:
                    break

                for record in batch:
                    yield Issue(record)

                offset += limit
示例#19
0
    def issues(self,
               collection=None,
               issn=None,
               from_date=None,
               until_date=None):
        """Yield ``Issue`` objects fetched page by page from the issues endpoint.

        Requests are made through ``self._do_request`` in pages of 100. The
        range from ``from_date`` (default ``DEFAULT_FROM_DATE``) to
        ``until_date`` (default today, ``YYYY-MM-DD``) is split with
        ``dates_pagination``. Paging within a window stops when the request
        returns ``None`` or an empty ``objects`` list.
        """
        query = {'limit': 100}
        if collection:
            query['collection'] = collection
        if issn:
            query['issn'] = issn

        start = from_date or DEFAULT_FROM_DATE
        end = until_date or datetime.today().isoformat()[:10]

        for window_from, window_until in dates_pagination(start, end):
            # The same dict is reused; each window restarts at offset 0.
            query.update(
                {'from': window_from, 'until': window_until, 'offset': 0})

            while True:
                endpoint = urljoin(self.ARTICLEMETA_URL, self.ISSUES_ENDPOINT)
                page = self._do_request(endpoint, params=query)
                if page is None:
                    break

                batch = page.get('objects', [])
                if len(batch) == 0:
                    break

                for record in batch:
                    yield Issue(record)

                query['offset'] += 100
 def test_issue_data_to_link_without_number(self):
     """Without field ``v32`` the link data has no ``number``."""
     record = self.issues[-1]
     del record["v32"]
     link = issue_data_to_link(Issue({"issue": record}))
     self.assertIsNone(link.get("number"))
示例#21
0
def conversion_issues_to_xylose(issues: List[dict]) -> List[Issue]:
    """Convert a list of issues in JSON format into a list of xylose
    ``Issue`` objects, wrapping each record under the ``"issue"`` key."""

    return [Issue({"issue": record}) for record in issues]
 def test_should_not_find_bundles_for_journal(self):
     """With ``v35`` set to "0001-3714X" no bundle is found for the sample
     kernel journal."""
     self.issue_json["v35"] = [{"_": "0001-3714X"}]
     candidates = [Issue({"issue": self.issue_json})]
     found = find_documents_bundles(SAMPLE_KERNEL_JOURNAL, candidates)
     self.assertListEqual([], found)
 def test_should_link_journal_and_issues(self):
     """The sample issue is linked to the sample kernel journal by its id."""
     candidates = [Issue({"issue": self.issue_json})]
     found = find_documents_bundles(SAMPLE_KERNEL_JOURNAL, candidates)
     expected = [SAMPLE_ISSUES_KERNEL[0]["id"]]
     self.assertEqual(expected, found)
 def test_issue_data_to_link_without_supplement(self):
     """The last sample issue yields link data with no ``supplement``."""
     link = issue_data_to_link(Issue({"issue": self.issues[-1]}))
     self.assertIsNone(link.get("supplement"))
 def setUp(self):
     """Give each test its own deep copy of the first sample issue and a
     xylose ``Issue`` built from it."""
     sample = deepcopy(SAMPLE_ISSUES_JSON[0])
     self.issue_json = sample
     self.basic_issue = Issue({"issue": sample})
 def test_issue_data_to_link_without_volume(self):
     """Without field ``v31`` the link data has no ``volume``."""
     record = self.issues[-1]
     del record["v31"]
     link = issue_data_to_link(Issue({"issue": record}))
     self.assertIsNone(link.get("volume"))
示例#27
0
 def test_issue_has_year_in_id_because_it_is_not_aop(self):
     """With a volume in ``v31`` both ``id`` and ``_id`` contain "2019"."""
     self.issue_json["v31"] = [{"_": "21"}]
     self._issue = Issue({"issue": self.issue_json})
     self.issue = issue_to_kernel(self._issue)
     for key in ("id", "_id"):
         self.assertIn("2019", self.issue[key])