Example #1
# Imports assumed: these helpers live in the INDRA codebase.
from indra.literature import s3_client
from indra.util import unicode_strs


def test_check_pmid():
    pmid = s3_client.check_pmid(12345)
    assert pmid == 'PMID12345'
    assert unicode_strs(pmid)
    pmid = s3_client.check_pmid('12345')
    assert pmid == 'PMID12345'
    assert unicode_strs(pmid)
    pmid = s3_client.check_pmid('PMID12345')
    assert pmid == 'PMID12345'
    assert unicode_strs(pmid)
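
The assertions above fix the normalization contract for check_pmid: an integer, a bare digit string, and an already-prefixed string all map to the same 'PMID'-prefixed Unicode string. A minimal sketch of a function that would satisfy these tests (the actual s3_client implementation may differ):

def check_pmid(pmid):
    # Sketch only: coerce to str and ensure the 'PMID' prefix,
    # matching the behavior the tests above assert.
    pmid = str(pmid)
    if not pmid.startswith('PMID'):
        pmid = 'PMID' + pmid
    return pmid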
Example #2
import os

from indra.literature import s3_client, elsevier_client


def get_text(pmid, input_dir, force_fulltext=False):
    # pmid, input_dir and force_fulltext are assumed as parameters here;
    # in the original context they may come from an enclosing scope.
    full_pmid = s3_client.check_pmid(pmid)
    # Look for the full text on S3
    (content, content_type) = s3_client.get_upload_content(
        pmid, force_fulltext_lookup=force_fulltext)
    content_path = None
    # Pick the content source and output path based on the content type
    if content_type is None or content is None:
        # No content found on S3, skipping
        content_source = 'content_not_found'
    elif content_type == 'pmc_oa_xml':
        content_source = 'pmc_oa_xml'
        content_path = os.path.join(input_dir, '%s.nxml' % pmid)
    elif content_type == 'pmc_auth_xml':
        content_source = 'pmc_auth_xml'
        content_path = os.path.join(input_dir, '%s.nxml' % pmid)
    elif content_type == 'pmc_oa_txt':
        content_source = 'pmc_oa_txt'
        content_path = os.path.join(input_dir, '%s.txt' % pmid)
    elif content_type == 'elsevier_xml':
        content = elsevier_client.extract_text(content)
        # Couldn't get text from the Elsevier XML
        if content is None:
            content_source = 'elsevier_extract_text_failure'
        else:
            content_source = 'elsevier_xml'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
    elif content_type == 'txt':
        content_source = 'txt'
        content_path = os.path.join(input_dir, '%s.txt' % pmid)
    elif content_type == 'abstract':
        content_source = 'abstract'
        content_path = os.path.join(input_dir, '%s.txt' % pmid)
    else:
        # Unhandled content type, skipping
        content_source = 'unhandled_content_type_%s' % content_type
    # If we got content, write it to a file with the appropriate extension
    if content_path:
        with open(content_path, 'wb') as f:
            # The XML string is Unicode; encode before writing bytes
            enc = content.encode('utf-8')
            f.write(enc)
    # Return a dict of results for this PMID
    result = {
        pmid: {
            'content_source': content_source,
            'content_path': content_path
        }
    }
    return result
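
For a single PMID the function returns a one-entry dict keyed by the PMID. A hypothetical invocation, using the parameters assumed in the rewrite above (paths and return value are illustrative):

result = get_text('12345', input_dir='input', force_fulltext=False)
# e.g. {'12345': {'content_source': 'pmc_oa_xml',
#                 'content_path': 'input/12345.nxml'}}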
Example #3
    os.chmod(base_dir, stat.S_IRWXO | stat.S_IRWXU | stat.S_IRWXG)
    input_dir = os.path.join(base_dir, 'input')
    output_dir = os.path.join(base_dir, 'output')
    os.makedirs(input_dir)
    os.makedirs(output_dir)

    pmids_to_read = []

    # If we're re-reading no matter what, we don't have to check for existing
    # REACH output
    if force_read:
        pmids_to_read = pmids_in_range
    # Otherwise, check if we've read the PMIDs already
    else:
        for pmid in pmids_in_range:
            pmid = s3_client.check_pmid(pmid)
            read_reach_version, read_source_text = \
                s3_client.get_reach_metadata(pmid)
            # Found it, same version
            if read_reach_version is not None and \
               read_reach_version == reach_version:
                logger.info('%s: found same version (%s), skipping' %
                            (pmid, read_reach_version))
            # Not found, or found with a different version
            else:
                logger.info('%s: found %s, current %s; will re-read' %
                            (pmid, read_reach_version, reach_version))
                pmids_to_read.append(pmid)

    if not pmids_to_read:
        logger.info('No pmids to read!')
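
The branch above re-reads a PMID when no prior REACH output exists or when the stored REACH version differs from the current one; force_read bypasses the check entirely. A hypothetical helper capturing just that predicate (not part of the original code):

def needs_read(read_reach_version, reach_version, force_read=False):
    # Hypothetical refactoring of the branch logic above.
    if force_read:
        return True
    # Re-read when there is no prior output or the version differs
    return read_reach_version is None or read_reach_version != reach_version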