def test_num_fields(self): d = { 'canonical': "1998PPGeo..22..553A", 'refereed': { 'refereed': False }, 'author': ["Arnfield, A. L."], 'reads': [1, 2, 3, 4], 'download': [0, 1, 2, 3], 'citations': ['1998PPGeo..22..553A'], 'id': 11, 'reference': ["1997BoLMe..85..475M"] } m = mock_open(read_data='') m.return_value.__iter__ = lambda self: iter(self.readline, '') with patch('builtins.open', m), Processor(compute_metrics=True) as processor: Cache.get('citation')['1998PPGeo..22..553A'].append( '1998PPGeo..22..553B') Cache.get('reference')['1998PPGeo..22..553A'].append( '1998PPGeo..22..553B') met = processor._compute_metrics(d) self.assertEqual(met['citation_num'], len(d['citations'])) self.assertEqual(met['reference_num'], len(d['reference'])) self.assertEqual(met['author_num'], len(d['author'])) self.assertEqual(met['refereed_citation_num'], 0)
def test_read(self): """can we read in all the data for a bibcode""" with Processor(compute_metrics=False) as processor, patch( 'adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): d = processor._read_next_bibcode('1057wjlf.book.....C') self.assertEqual(d['canonical'], '1057wjlf.book.....C') self.assertEqual(len(d['author']), 1) self.assertEqual(d['author'], ['Chao, C']) self.assertFalse(d['citation']) self.assertEqual(d['download'], [ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 ]) self.assertFalse(d['grants']) self.assertFalse(d['ned_objects']) self.assertTrue(d['nonarticle']) self.assertEqual(d['ocrabstract'], {'ocrabstract': False}) self.assertEqual(d['private'], {'private': False}) self.assertEqual(d['pub_openaccess'], {'pub_openaccess': False}) self.assertEqual(d['readers'], ['4fc45951aa', '557ebfd055', '57fcb9018a']) self.assertEqual(d['reads'], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 6 ]) self.assertEqual(d['refereed'], {'refereed': False}) self.assertEqual( d['relevance'], { 'norm_cites': 0, 'read_count': 25, 'boost': 0.32, 'citation_count': 0 })
def test_protobuf(self): """make sure protobuf are created without an exception""" with Processor(compute_metrics=False) as processor, patch( 'adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): d = processor._read_next_bibcode('1057wjlf.book.....C') c = processor._convert(d) nonbib = NonBibRecord(**c) print('nonbib = {}'.format(nonbib))
def test_open(self): """can we open data files in the test directory""" with Processor(compute_metrics=False) as processor, patch( 'adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): self.assertEqual(len(data_files), len(processor.readers)) self.assertTrue( isinstance(processor.readers['citation'], reader.NonbibFileReader)) self.assertEqual(0, len(processor.readers))
def test_merge(self): """merge when link_type and link_sub_type match""" self.maxDiff = None d = [{"url": ["http://archive.stsci.edu/prepds/gems"], "title": ["GEMS: Galaxy Evolution from Morphologies and SEDs (Hans-Walter Rix)"], "item_count": 1, "link_type": "DATA", "link_sub_type": "MAST"}, {"url": ["https://arxiv.org/abs/astro-ph/0401427"], "title": [""], "item_count": 1, "link_type": "ESOURCE", "link_sub_type": "EPRINT_HTML"}, {"url": ["https://archive.stsci.edu/mastbibref.php?bibcode=2004ApJS..152..163R"], "title": ["MAST References (HST)"], "item_count": 1, "link_type": "DATA", "link_sub_type": "MAST"}, {"url": ["https://doi.org/10.1086%2F420885"], "title": [""], "link_type": "ESOURCE", "link_sub_type": "PUB_HTML"}] a = [{"url": ["http://archive.stsci.edu/prepds/gems", "https://archive.stsci.edu/mastbibref.php?bibcode=2004ApJS..152..163R"], "title": ["GEMS: Galaxy Evolution from Morphologies and SEDs (Hans-Walter Rix)", "MAST References (HST)"], "item_count": 2, "link_type": "DATA", "link_sub_type": "MAST"}, {"url": ["https://arxiv.org/abs/astro-ph/0401427"], "title": [""], "item_count": 1, "link_type": "ESOURCE", "link_sub_type": "EPRINT_HTML"}, {"url": ["https://doi.org/10.1086%2F420885"], "title": [""], "link_type": "ESOURCE", "link_sub_type": "PUB_HTML"}] p = Processor() p._merge_data_links(d) self.assertEqual(d, a)
def test_compute_bibgroup_facet(self): p = Processor() self.assertEqual({}, p._compute_bibgroup_facet({})) self.assertEqual({'bibgroup_facet': ['a']}, p._compute_bibgroup_facet({'bibgroup': ['a']})) self.assertEqual({'bibgroup_facet': ['a', 'b']}, p._compute_bibgroup_facet({'bibgroup': ['a', 'b']})) self.assertEqual({'bibgroup_facet': ['a', 'b']}, p._compute_bibgroup_facet( {'bibgroup': ['a', 'b', 'a']}))
def test_compute_identifier(self): p = Processor() d = { 'bibcode': '2013MNRAS.435.1904M', 'deleted': ['2013MNRAS.tmp.2206M'], 'doi': ['10.1093/mnras/stt1379'], 'preprint': ['1307.6556'], 'pub2arxiv': ['2013arXiv1307.6556M'] } a = p._compute_identifier(d) self.assertEqual( { 'identifier': [ '2013MNRAS.435.1904M', '2013MNRAS.tmp.2206M', '10.1093/mnras/stt1379', 'arxiv:1307.6556', '2013arXiv1307.6556M' ] }, a) d = { 'bibcode': 'bib', 'deleted': ['deleted1', 'deleted2'], 'doi': ['doia', 'doib'], 'preprint': ['preprintA', 'preprintB', 'preprintC'], 'pub2arxiv': ['arxivFoo', 'arxivBar'] } a = p._compute_identifier(d) self.assertEqual( { 'identifier': [ 'bib', 'deleted1', 'deleted2', 'doia', 'doib', 'arxiv:preprintA', 'arxiv:preprintB', 'arxiv:preprintC', 'arxivFoo', 'arxivBar' ] }, a) d = { 'bibcode': 'bib', 'deleted': [], 'doi': [], 'preprint': [], 'pub2arxiv': [] } a = p._compute_identifier(d) self.assertEqual({'identifier': ['bib']}, a) d = {'bibcode': 'bib'} a = p._compute_identifier(d) self.assertEqual({'identifier': ['bib']}, a)
def test_nonbib_record(self): self.maxDiff = None with Processor(compute_metrics=False) as processor, patch( 'adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): d = processor._read_next_bibcode('2003ASPC..295..361M') n = processor._convert(d) a = { "read_count": 4, "bibcode": "2003ASPC..295..361M", "data_links_rows": [{ "url": [ "http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M" ], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': [''] }, { "url": [ "http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M" ], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': [''] }, { "url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': [''] }], "esource": ["ADS_PDF", "ADS_SCAN"], "property": [ "ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC" ], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0 } self.assertEqual(a, n) d = processor._read_next_bibcode('2004MNRAS.354L..31M') v = processor._convert(d) a = { "bibcode": "2004MNRAS.354L..31M", "simbad_objects": ["3253618 G"], "read_count": 20, "data_links_rows": [{ "url": ["http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x"], "link_type": "ESOURCE", "link_sub_type": "PUB_HTML", 'item_count': 0, 'title': [''] }, { "url": ["https://arxiv.org/abs/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_HTML", 'item_count': 0, 'title': [''] }, { "url": [ "https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x" ], "link_type": "ESOURCE", "link_sub_type": "PUB_PDF", 'item_count': 0, 'title': [''] }, { "url": [ "http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M" ], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': [''] }, { "url": ["https://arxiv.org/pdf/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_PDF", 'item_count': 0, 'title': [''] }, { "url": [ "http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M" ], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': [''] }, { "url": ["2004MNRAS.354L..31M", "2005yCat..73549031M"], "title": ["Source Paper", "Catalog Description"], "link_type": "ASSOCIATED", "link_sub_type": "NA", 'item_count': 0 }, { "url": ["http://inspirehep.net/search?p=find+j+MNRAA,354,L31"], "link_type": "INSPIRE", "link_sub_type": "NA", 'item_count': 0, 'title': [''] }, { "url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "CDS", 'title': [''] }, { "url": [ "https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M" ], "title": ["NED Objects (1953)"], "item_count": 1953, "link_type": "DATA", "link_sub_type": "NED" }, { "url": ["http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M"], "title": ["SIMBAD Objects (1)"], "item_count": 1, "link_type": "DATA", "link_sub_type": "SIMBAD" }, { "url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "Vizier", 'title': [''] }], "norm_cites": 10000, "data": ["CDS:1", "NED:1953", "SIMBAD:1", "Vizier:1"], "citation_count_norm": 49.5, "citation_count": 99, "property": [ "ADS_OPENACCESS", "ARTICLE", "ASSOCIATED", "DATA", "EPRINT_OPENACCESS", "ESOURCE", "INSPIRE", "OPENACCESS", "PUB_OPENACCESS", "REFEREED" ], "total_link_counts": 1956, "esource": [ "ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF" ], "boost": 0.4399999976158142 } v_boost = v.pop('boost') a_boost = a.pop('boost') self.assertAlmostEqual(a_boost, v_boost) self.assertEqual(a, v)
def test_with_citations(self): d = { 'canonical': "1997BoLMe..85..475M", 'refereed': { 'refereed': True }, 'author': [ "Meesters, A. G. C. A.", "Bink, N. J.", "Henneken, E. A. C.", "Vugts, H. F.", "Cannemeijer, F." ], 'download': [], 'reads': [], 'citations': [ "1998PPGeo..22..553A", "1999P&SS...47..951S", "2000BoLMe..97..385O", "2001MAP....78..115K", "2002BoLMe.103...49H", "2006QJRMS.132..779R", "2006QJRMS.132...61E", "2008Sci...320.1622D", "2016BoLMe.159..469G" ], 'reference': [ "1994BoLMe..71..393V", "1994GPC.....9...53M", "1994GPC.....9...53X" ] } m = mock_open(read_data='') m.return_value.__iter__ = lambda self: iter(self.readline, '') with patch('builtins.open', m), Processor(compute_metrics=True) as processor: for bib in d['citations']: Cache.get('citation')['1997BoLMe..85..475M'].append(bib) for bib in d['reference']: Cache.get('reference')['1997BoLMe..85..475M'].append(bib) refereed = [ '1997BoLMe..85..475M', "1999P&SS...47..951S", "2000BoLMe..97..385O", "2001MAP....78..115K", "2002BoLMe.103...49H", "2006QJRMS.132..779R", "2006QJRMS.132...61E", "2008Sci...320.1622D", "2016BoLMe.159..469G" ] for bib in refereed: Cache.get('refereed').add(bib) # 1999P&SS...47..951S PSSreferences = [ "1973JAtS...30...66B", "1973JAtS...30..749L", "1976JAtS...33..923B", "1977JGR....82.4121B", "1977JGR....82.4249K", "1977JGR....82.4559H", "1978Icar...33..417W", "1978JAtS...35.2346S", "1978JGR....83.1889D", "1979Icar...39..151H", "1979Icar...39..184H", "1979JGR....84.2889J", "1979JGR....84.2929P", "1979Natur.278..531H", "1981GeoRL...8..899R", "1981suma.book.....C", "1982JAtS...39.2701M", "1982JGR....87.9975M", "1982MWRv..110..994A", "1985AdSpR...5...93H", "1985PhDT.........2P", "1985TellA..37..156A", "1985wagp.book.....G", "1987MWRv..115..936Y", "1987MWRv..115.2214P", "1988aitb.book.....S", "1989BAMS...70..738B", "1990JAtS...47..612Y", "1990JGR....95.1359J", "1991ConAP..64..103S", "1992aitd.book.....H", "1992BoLMe..59..141G", "1992JGR....97.7781Z", "1993JAtS...50...77S", "1993JGR....98.3125B", "1994DPS....26.1806G", "1995Icar..113..277M", "1995JGR...100.5277H", "1995MWRv..123.1146H", "1996Icar..122...36C", "1996JGR...10114957S", "1996Sci...271..184S", "1997AdSpR..19.1241S", "1997AdSpR..19.1289M", "1997BoLMe..85..475M", "1997JGR...102.4463W", "1997Sci...278.1758S", "1998Sci...279.1686S" ] for bib in PSSreferences: Cache.get('reference')['1999P&SS...47..951S'].append(bib) met = processor._compute_metrics(d) self.assertEqual(len(met['citations']), len(d['citations']), 'citations check') self.assertEqual(met['refereed_citation_num'], len(refereed) - 1) self.assertEqual(met['refereed_citations'], refereed[1:]) rn_citation_data = { "cityear": 1998, "pubyear": 1997, "auth_norm": 0.20000000298023224, "bibcode": "1998PPGeo..22..553A", "ref_norm": 0.20000000298023224 } rn_citation_data1 = { "cityear": 1999, "pubyear": 1997, "auth_norm": 0.20000000298023224, "bibcode": "1999P&SS...47..951S", "ref_norm": 0.02083333395421505 } self.compare_citation_data(met['rn_citation_data'][0], rn_citation_data) self.compare_citation_data(met['rn_citation_data'][1], rn_citation_data1) y = int(d['canonical'][:4]) today = datetime.today() age = max(1.0, today.year - y + 1) self.assertAlmostEqual(met['an_refereed_citations'], len(met['refereed_citations']) / float(age), 5)
def test_add_data_summary(self): self.maxDiff = None with Processor(compute_metrics=False) as processor, patch( 'adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): data_links_rows = [{ "url": ["http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x"], "link_type": "ESOURCE", "link_sub_type": "PUB_HTML" }, { "url": ["https://arxiv.org/abs/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_HTML" }, { "url": [ "https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x" ], "link_type": "ESOURCE", "link_sub_type": "PUB_PDF" }, { "url": ["http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF" }, { "url": ["https://arxiv.org/pdf/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_PDF" }, { "url": [ "http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M" ], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN" }, { "url": ["2004MNRAS.354L..31M", "2005yCat..73549031M"], "title": ["Source Paper", "Catalog Description"], "link_type": "ASSOCIATED", "link_sub_type": "NA" }, { "url": ["http://inspirehep.net/search?p=find+j+MNRAA,354,L31"], "link_type": "INSPIRE", "link_sub_type": "NA" }, { "url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "CDS" }, { "url": [ "https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M" ], "title": ["NED Objects (1953)"], "item_count": 1953, "link_type": "DATA", "link_sub_type": "NED" }, { "url": ["http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M"], "title": ["SIMBAD Objects (1)"], "item_count": 1, "link_type": "DATA", "link_sub_type": "SIMBAD" }, { "url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "Vizier" }] d = {'data_links_rows': data_links_rows} processor._add_data_summary(d) self.assertEqual(["CDS:1", "NED:1953", "SIMBAD:1", "Vizier:1"], d['data'])
def test_read(self): """can we read in all the data for a bibcode""" with Processor(compute_metrics=False) as processor, patch( 'adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): d = processor._read_next_bibcode('1057wjlf.book.....C') self.assertEqual(d['canonical'], '1057wjlf.book.....C') self.assertEqual(len(d['author']), 1) self.assertEqual(d['author'], ['Chao, C']) self.assertEqual(d['citation'], [ '1800RSPT...90..255H', '1800RSPT...90..284H', '1800RSPT...90..437H', '1826AnP....82..133S', '1840RSPT..130....1H', '1883RSPT..174..935R', '1885Natur..32..245L', '1890GSAB....1..411D', '1893GSAB....5..225C', '1894GSAB....6..199C', '1896AN....140..161J', '1898ApJ.....7...86H', '1900JG......8..135D', '1901AnP...309..104H', '1902RSPTA.199....1J', '1903AnP...317..449L', '1903GSAB...14..227N', '1904PhRvI..18..355N', '1905PhRvI..21..247N', '1905Sci....22..572G', '1906PhRvI..22..279N', '1906PhRvI..23...37N', '1906tdiu.book.....N', '1907AnP...329..164W', '1907PhRvI..25..362N', '1908AnP...330..377M', '1908Natur..78..366R', '1908PhRvI..26..312P', '1908PhRvI..26..454P', '1908PhRvI..27..209W', '1908PhRvI..27..367N', '1909AnP...333...75K', '1909RSPSA..82..172S', '1911PhRvI..32..492C', '1913LowOB...2...56S', '1913PhRv....2..450L', '1915AnP...353.1103S', '1915PA.....23...21S', '1916JG.....24..313B', '1917PhyZ...18..121E', '1917RSPSA..93..148R', '1918AJ.....31..185H', '1918ApJ....47....1M', '1920LicOB..10...64B', '1920Natur.105....8A', '1920Natur.106..468A', '1921ApJ....53..121B', '1921Natur.107..334A', '1921PhRv...18...31H', '1922Natur.109..813A', '1922Natur.110..664A', '1922Natur.110..732A', '1922PhRv...19..246M', '1922ZPhy...10..377F', '1924Natur.114..717A', '1924RSPSA.106..749B', '1925JOSA...11..233T', '1926RSPSA.110..709R', '1927ASSB...47...49L', '1927TeMAE..32..173W', '1928PCPS...24..180F', '1928RSPSA.119..173F', '1929PNAS...15..168H', '1929PhRv...33..954L', '1929PhRv...34...57M', '1929ZPhy...52..853K', '1930AnP...397..325B', '1930ApJ....71..102P', '1930JG.....38...88Q', '1930PhRv...35.1303E', '1930ZPhy...63..245L', '1931AnP...402..715K', '1931MNRAS..91..483L', '1931NW.....19..964K', '1931PhRv...37..405O', '1931PhRv...37.1276F', '1931PhRv...38.1827P', '1931RSPSA.132..487A', '1932AnP...406..531M', '1932JG.....40..305A', '1932PNAS...18..213E', '1932PhRv...39.1021B', '1932PhRv...41..364P', '1932PhRv...41..369B', '1932RSPSA.137..696Z', '1933ASSB...53...51L', '1933JChPh...1..515B', '1933RSPSA.142..142M', '1933RvMP....5...62R', '1933ZPhy...81..313M', '1933ZPhy...81..445B', '1933ZPhy...81..465F', '1934ApJ....79....8H', '1934GSAB...45.1017H', '1934JChPh...2..599O', '1934JRASC..28..303V', '1934PhRv...45....1B', '1934PhRv...45..488S', '1934RSPSA.146..483F' ]) self.assertEqual(d['download'], [ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 ]) self.assertFalse(d['grants']) self.assertFalse(d['ned_objects']) self.assertTrue(d['nonarticle']) self.assertEqual(d['ocrabstract'], {'ocrabstract': False}) self.assertEqual(d['private'], {'private': False}) self.assertEqual(d['pub_openaccess'], {'pub_openaccess': False}) self.assertEqual(d['readers'], ['4fc45951aa', '557ebfd055', '57fcb9018a']) self.assertEqual(d['reads'], [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 6 ]) self.assertEqual(d['refereed'], {'refereed': False}) self.assertEqual( d['relevance'], { 'norm_cites': 0, 'read_count': 25, 'boost': 0.32, 'deprecated_citation_count': 0 })
def main(): parser = argparse.ArgumentParser( description= 'Process nonbib input data files and send data to master pipeline') subparsers = parser.add_subparsers(help='commands', dest="action", required=True) diff_parser = subparsers.add_parser( 'COMPUTE_DIFF', help= 'Compute changed bibcodes by comparing current and previous data sets. Changed bibcodes are stored in the file ./logs/input/current/changedBibcodes.txt.' ) file_parser = subparsers.add_parser( 'PROCESS_FILE', help= 'Send nonbib and metrics protobufs to master for the list of bibcodes in the provided file' ) file_parser.add_argument('input_filename', action='store', type=str, help='Path to input file, required.') file_parser.add_argument( '--no-metrics', action='store_false', dest='compute_metrics', help= 'Only send nonbib protobufs to master, do not init cache or send metrics protobufs' ) bibcodes_parser = subparsers.add_parser( 'PROCESS_BIBCODES', help= 'Send data to master for the bibcodes provided on the command line.') bibcodes_parser.add_argument('--bibcodes', action='store', default=None, dest='bibcodes', nargs='+', required=True, type=str, help='Space delimited list of bibcodess.') bibcodes_parser.add_argument( '--no-metrics', dest='compute_metrics', action='store_false', help= 'Only send nonbib protobufs to master, do not init cache or send metrics protobufs.' ) args = parser.parse_args() if args.action == 'COMPUTE_DIFF': Diff.compute() else: # where with PROCESS_BIBCODES or PROCESS_FILE if args.compute_metrics: Cache.init() if args.action == 'PROCESS_BIBCODES': # parse and sort bibcodes = args.bibcodes.sort() with Processor(compute_metrics=args.compute_metrics) as processor: processor.process_bibcodes(bibcodes) print('processedbibcodes {}'.format(bibcodes)) elif args.action == 'PROCESS_FILE': Diff.execute('sort -o {} {}'.format(args.input_filename, args.input_filename)) # send bibcodes from file to processing in batches count = 0 bibcodes = [] with open(args.input_filename, 'r') as f, Processor( compute_metrics=args.compute_metrics) as processor: for line in f: if count % 10000 == 0: print('{}: processed bibcodes count = {}'.format( datetime.datetime.now(), count)) count = count + 1 line = line.strip() if line: bibcodes.append(line) if len(bibcodes) % 100 == 0: processor.process_bibcodes(bibcodes) bibcodes = [] if len(bibcodes) > 0: processor.process_bibcodes(bibcodes) print( '{}: completed processing bibcodes from {}, count = {}'.format( datetime.datetime.now(), args.input_filename, count))