def gh_repos_from_metadata(metadata):
    """Find GitHub repos for an article, trying the abstract before the PDF.

    The abstract text (``metadata['abstract']``) is searched first with
    ``gh_repo_from_text``. Only if that yields ``None`` is the PDF named by
    ``metadata['internal_pdf']`` (looked up under the module-level
    ``pdf_dir``) searched with ``gh_repo_from_pdf``.

    Args:
        metadata: Mapping with an ``'abstract'`` entry and, when the abstract
            yields nothing, an ``'internal_pdf'`` entry.

    Returns:
        A dict ``{'repos': <helper result>, 'source': 'abstract' | 'pdf'}``
        for the first source that produced a non-``None`` result, or ``None``
        when neither source did or the PDF file does not exist.
    """
    # The abstract takes priority: a hit here means the PDF is never touched.
    abstract_repos = gh_repo_from_text(metadata['abstract'])
    if abstract_repos is not None:
        return {'repos': abstract_repos, 'source': 'abstract'}

    # Fall back to the article's PDF, if it is present on disk.
    pdf_path = "%s/%s" % (pdf_dir, metadata['internal_pdf'])
    if not os.path.isfile(pdf_path):
        return None

    # gh_repo_from_pdf signals "nothing found" with None; pass that through.
    pdf_repos = gh_repo_from_pdf(pdf_path)
    if pdf_repos is None:
        return None
    return {'repos': pdf_repos, 'source': 'pdf'}
def test_repo_from_pdf_1(self):
    # gh_repo_from_pdf must return exactly this set of repo names for the
    # Chrom3D article PDF.
    # NOTE(review): absolute path under one user's home directory; the file
    # presumably has to exist locally for this test to pass - confirm.
    pdf_path = "/Users/prussell/Dropbox/github_mining/articles/pdfs/Paulsen-2017-Chrom3D_ three-dimensional genome.pdf"
    found = gh_repo_from_pdf(pdf_path)
    expected = {"3dgenomes/TADbit", "CollasLab/Chrom3D"}
    self.assertSetEqual(found, expected)
def test_repo_from_pdf_4(self):
    # gh_repo_from_pdf must return exactly this set of repo names for the
    # sleuth (differential RNA-seq analysis) article PDF.
    # NOTE(review): absolute path under one user's home directory; the file
    # presumably has to exist locally for this test to pass - confirm.
    pdf_path = "/Users/prussell/Dropbox/github_mining/articles/pdfs/Pimentel-2017-Differential analysis of RNA-seq.pdf"
    found = gh_repo_from_pdf(pdf_path)
    expected = {"pachterlab/sleuth", "pachterlab/sleuth_paper_analysis"}
    self.assertSetEqual(found, expected)
def test_repo_from_pdf_3(self):
    # gh_repo_from_pdf must return exactly this set of repo names for the
    # SLiMScape article PDF.
    # NOTE(review): absolute path under one user's home directory; the file
    # presumably has to exist locally for this test to pass - confirm.
    pdf_path = "/Users/prussell/Dropbox/github_mining/articles/pdfs/Olorin-2015-SLiMScape 3.x_ a Cytoscape 3 app f.pdf"
    found = gh_repo_from_pdf(pdf_path)
    expected = {
        "slimsuite/SLiMScape",
        "F1000Research/SLiMScape",
        "slimsuite/SLiMSuite",
    }
    self.assertSetEqual(found, expected)
def test_repo_from_pdf_2(self):
    # gh_repo_from_pdf must return exactly this set of repo names for the
    # Cluster Flow article PDF.
    # NOTE(review): absolute path under one user's home directory; the file
    # presumably has to exist locally for this test to pass - confirm.
    pdf_path = "/Users/prussell/Dropbox/github_mining/articles/pdfs/Ewels-2016-Cluster Flow_ A user-friendly bioin.pdf"
    found = gh_repo_from_pdf(pdf_path)
    expected = {
        "ewels/sra-explorer",
        "ewels/clusterflow",
        "ewels/labrador",
    }
    self.assertSetEqual(found, expected)