def test_check_pmid():
    """check_pmid must normalize ints, bare digit strings, and already-prefixed
    strings to the canonical 'PMID<n>' form, returned as a unicode string."""
    for raw in (12345, '12345', 'PMID12345'):
        normalized = s3_client.check_pmid(raw)
        assert normalized == 'PMID12345'
        assert unicode_strs(normalized)
def get_text():
    """Fetch content for `pmid` from S3, write it under `input_dir`, and
    report where it came from.

    Closure variables (defined in the enclosing scope): `pmid`,
    `force_fulltext`, `input_dir`.

    Returns a dict of the form
        {pmid: {'content_source': <label>, 'content_path': <path or None>}}
    where `content_path` is None when no file was written (content missing,
    Elsevier text extraction failed, or the content type is unhandled).
    """
    # NOTE(review): full_pmid is computed but never used below — confirm
    # whether get_upload_content was meant to receive it instead of pmid.
    full_pmid = s3_client.check_pmid(pmid)
    # Look for the full text on S3.
    content, content_type = s3_client.get_upload_content(
        pmid, force_fulltext_lookup=force_fulltext)

    # File extension per directly-writable content type; for these the
    # source label is the content type itself.
    ext_by_type = {
        'pmc_oa_xml': '.nxml',
        'pmc_auth_xml': '.nxml',
        'pmc_oa_txt': '.txt',
        'txt': '.txt',
        'abstract': '.txt',
    }

    content_path = None
    if content is None or content_type is None:
        # Nothing retrievable from S3 for this PMID.
        content_source = 'content_not_found'
    elif content_type == 'elsevier_xml':
        # Elsevier XML must first be reduced to plain text.
        content = elsevier_client.extract_text(content)
        if content is None:
            # Couldn't get text from the Elsevier XML.
            content_source = 'elsevier_extract_text_failure'
        else:
            content_source = 'elsevier_xml'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
    elif content_type in ext_by_type:
        content_source = content_type
        content_path = os.path.join(
            input_dir, '%s%s' % (pmid, ext_by_type[content_type]))
    else:
        # Unhandled content type, skipping.
        content_source = 'unhandled_content_type_%s' % content_type

    # If we resolved a destination, persist the (unicode) content as UTF-8.
    if content_path:
        with open(content_path, 'wb') as f:
            f.write(content.encode('utf-8'))

    # Dict of results for this PMID.
    return {pmid: {'content_source': content_source,
                   'content_path': content_path}}
# NOTE(review): world-readable/writable/executable permissions (S_IRWXO) —
# presumably so other users/containers share this scratch space; confirm
# this openness is intended.
os.chmod(base_dir, stat.S_IRWXO | stat.S_IRWXU | stat.S_IRWXG)
# Per-run working directories for reader input and output.
input_dir = os.path.join(base_dir, 'input')
output_dir = os.path.join(base_dir, 'output')
os.makedirs(input_dir)
os.makedirs(output_dir)
pmids_to_read = []
# If we're re-reading no matter what, we don't have to check for existing
# REACH output
if force_read:
    pmids_to_read = pmids_in_range
# Otherwise, check if we've read the PMIDs already
else:
    for pmid in pmids_in_range:
        # Normalize the identifier to the canonical 'PMID<n>' form.
        pmid = s3_client.check_pmid(pmid)
        (read_reach_version, read_source_text) = \
            s3_client.get_reach_metadata(pmid)
        # Found it, same version: skip re-reading this PMID.
        if read_reach_version is not None and \
           read_reach_version == reach_version:
            logger.info('%s: found same version (%s), skipping' %
                        (pmid, read_reach_version))
        # Found it, different version (or no prior output at all, in which
        # case read_reach_version is None): queue for (re-)reading.
        else:
            logger.info('%s: found %s, current %s; will re-read' %
                        (pmid, read_reach_version, reach_version))
            pmids_to_read.append(pmid)
if not pmids_to_read:
    logger.info('No pmids to read!')