Exemplo n.º 1
0
def run(journal, num_articles):
    """Sample article dates from recent volumes of ``journal`` and record them.

    For each of the last ``num_volumes`` volumes (newest first), the contents
    page of issue 1 is fetched via ``html.build_urls`` and a date string is
    extracted from up to ``num_articles`` article pages. Articles whose year
    matches the year reported by ``html.detect_start_volume`` are written out
    through ``FileWriter``.

    If any command-line argument is present, a brief test run is performed
    instead (3 articles from a single volume), overriding ``num_articles``.

    Args:
        journal: Journal identifier, passed to ``FileWriter`` and
            ``html.build_urls``.
        num_articles: Maximum number of articles to sample per volume.

    Returns:
        The number of articles whose date string was successfully fetched.
    """
    print("Running publication-dates version 1.1\n")

    # Setup output file, set parameters, and use brief run if testing
    writer = FileWriter(journal)

    num_volumes = 18  # 18 volumes per year
    issue = 1  # sample issue for each volume

    if len(sys.argv) > 1:
        print("Testing.....")
        num_articles = 3
        num_volumes = 1

    counter = 0

    # Everything after the writer is created runs under try/finally so the
    # output file is closed even if a later step raises.
    try:
        # Sample papers accepted in previous year
        date = html.detect_start_volume()
        start_volume = date[0]
        acceptance_year = date[1]

        volumes = range(start_volume - num_volumes + 1, start_volume + 1)

        for volume in reversed(volumes):

            # Go to volume/issue contents page, and extract URLs of articles
            articles = html.build_urls(journal, volume, issue)

            # Clamp to the URLs actually available so that a short issue
            # yields fewer samples instead of an IndexError.
            # NOTE(review): sampling starts at index 1, so articles[0] is
            # never read. Kept as in the original; confirm whether skipping
            # the first entry is intentional.
            last_index = min(num_articles, len(articles) - 1)

            for num in range(1, last_index + 1):
                url = articles[num]

                try:
                    date_string = html.get_date_div(url)
                except Exception:
                    # Best-effort: give up on the rest of this volume, but let
                    # KeyboardInterrupt / SystemExit propagate.
                    # Formatted to match the spacing of the original
                    # comma-separated print statement.
                    print("Some error occurred (URL ' %s ' not available?). Skipping." % (url,))
                    break

                counter += 1

                article = Article(date_string)

                if article.get_year() == acceptance_year:
                    writer.write_to_file(article)
    finally:
        writer.close_file()

    return counter
Exemplo n.º 2
0
    def test_html_extracts_article_urls_from_contents_page_apjl(self):
        # Checks that html.build_urls, called for journal "APJL" with the
        # fixture volume and issue, returns these five URLs first.
        first_five_expected = [
            "http://iopscience.iop.org/article/10.3847/2041-8213/836/2/L17",
            "http://iopscience.iop.org/article/10.3847/2041-8213/aa5dab",
            "http://iopscience.iop.org/article/10.3847/2041-8213/aa5cb0",
            "http://iopscience.iop.org/article/10.3847/2041-8213/aa5eb0",
            "http://iopscience.iop.org/article/10.3847/2041-8213/aa5dee"
        ]

        # URLs of all articles in the issue
        all_urls = html.build_urls("APJL", self.test_vol, self.test_iss)

        # Only the leading five entries are compared.
        leading_urls = all_urls[:5]
        self.assertEqual(first_five_expected, leading_urls)
Exemplo n.º 3
0
    def test_html_extracts_article_urls_from_contents_page_apj(self):
        # Checks that html.build_urls, called for journal "APJ" with the
        # fixture volume and issue, returns these five URLs first.
        first_five_expected = [
            "http://iopscience.iop.org/article/10.3847/1538-4357/aa5be8",
            "http://iopscience.iop.org/article/10.3847/1538-4357/aa5b8b",
            "http://iopscience.iop.org/article/10.3847/1538-4357/aa5b88",
            "http://iopscience.iop.org/article/10.3847/1538-4357/836/2/152",
            "http://iopscience.iop.org/article/10.3847/1538-4357/836/2/153"
        ]

        # URLs of all articles in the issue
        all_urls = html.build_urls("APJ", self.test_vol, self.test_iss)

        # Only the leading five entries are compared.
        leading_urls = all_urls[:5]
        self.assertEqual(first_five_expected, leading_urls)