def test_after_date(self):
    """Check the ``date_after`` filter of ``WOSStream`` against SMALL_XML.

    With a 2015 cut-off the stream must yield nothing; with a 2014 cut-off
    every yielded entry must equal ``SMALL_PARSED``.
    """
    too_late = datetime.datetime.strptime("2015", "%Y")
    parser = WOSStream(SMALL_XML, date_after=too_late)
    for entry in parser.parse():
        # Raising a plain string is a TypeError in Python 3; report a
        # proper test failure instead.
        self.fail("This shouldn't ever happen")

    early_enough = datetime.datetime.strptime("2014", "%Y")
    parser = WOSStream(SMALL_XML, date_after=early_enough)
    entries = list(parser.parse())
    # Without this check the loop below passes vacuously when the filter
    # wrongly drops everything.
    self.assertTrue(entries, "date_after=2014 filtered out every entry")
    for entry in entries:
        self.assertDictEqual(entry, SMALL_PARSED)
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size, date_after, filter_set):
    """Parse queued files with ``WOSStream`` and emit entries in batches.

    File names are pulled from ``files`` until the ``'STOP'`` sentinel is
    received. Each file is opened with ``open_file`` and parsed; surviving
    entries are collected into lists that are ``put`` on ``entries``.

    Args:
        files: Queue-like object providing ``get()`` and ``task_done()``.
        entries: Queue-like object providing ``put()``; receives lists of
            parsed entries.
        wos_only, sample_rate, must_cite, date_after: Forwarded unchanged to
            ``WOSStream`` as keyword arguments.
        batch_size: A batch is queued as soon as it holds at least this many
            entries.
        filter_set: When truthy, entries whose ``"id"`` is not contained in
            it are dropped and counted; when falsy, nothing is filtered.
    """
    log = logging.getLogger(__name__).getChild('parser')
    batch = []
    filtered_out = 0
    wrote = 0
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f, wos_only=wos_only, sample_rate=sample_rate,
                          must_cite=must_cite, date_after=date_after)
            for entry in p.parse():
                if filter_set and entry["id"] not in filter_set:
                    filtered_out += 1
                    continue
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    # Count what was actually queued, so the total stays
                    # correct even when batch_size is smaller than 1.
                    wrote += len(batch)
                    batch = []
            # Flush the partial batch left over from this file.
            if batch:
                entries.put(batch)
                wrote += len(batch)
                batch = []
        # Acknowledge the file name taken from the queue.
        files.task_done()
    log.info("Wrote %s entries", wrote)
    if filter_set:
        log.info("Excluded %s entries", filtered_out)
    # Acknowledge the 'STOP' sentinel itself.
    files.task_done()
def test_parse_nowos(self):
    """Check ``wos_only`` parsing of NOWOS_XML, with and without ``must_cite``.

    With ``wos_only=True`` the first entry must equal ``SMALL_PARSED`` with
    the title replaced by "Potatoes" and an empty citation list. Adding
    ``must_cite=True`` must leave no entries at all.
    """
    parser = WOSStream(NOWOS_XML, wos_only=True)
    entries = list(parser.parse())

    # Work on a copy: mutating the shared SMALL_PARSED fixture in place
    # would leak the changed title/citations into every other test that
    # compares against it.
    expected = dict(SMALL_PARSED)
    expected["title"] = "Potatoes"
    expected["citations"] = []
    self.assertDictEqual(entries[0], expected)

    parser = WOSStream(NOWOS_XML, wos_only=True, must_cite=True)
    entries = list(parser.parse())
    self.assertListEqual(entries, [])
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size, date_after):
    """Parse queued files with ``WOSStream`` and emit entries in batches.

    File names are taken from ``files`` (``get()`` / ``task_done()``) until
    the ``'STOP'`` sentinel arrives. Each file is opened with ``open_file``
    and parsed; entries are grouped into lists of at least ``batch_size``
    items that are ``put`` on ``entries``. Whatever is left at the end of a
    file is flushed as a final, shorter list.

    ``wos_only``, ``sample_rate``, ``must_cite`` and ``date_after`` are
    forwarded unchanged to ``WOSStream``.
    """
    pending = []
    while True:
        filename = files.get()
        if filename == 'STOP':
            break
        with open_file(filename) as handle:
            stream = WOSStream(
                handle,
                wos_only=wos_only,
                sample_rate=sample_rate,
                must_cite=must_cite,
                date_after=date_after,
            )
            for record in stream.parse():
                pending.append(record)
                if len(pending) < batch_size:
                    continue
                entries.put(pending)
                pending = []
            # Flush the partial batch left over from this file.
            if pending:
                entries.put(pending)
                pending = []
        # Acknowledge the file name taken from the queue.
        files.task_done()
    # Acknowledge the 'STOP' sentinel itself.
    files.task_done()
'--benchmark_freq', help="How often to emit benchmark info", type=int, default=1000000) parser.add_argument('infile', nargs='+') arguments = parser.parse_args() date_after = None if arguments.after: date_after = datetime.datetime.strptime(arguments.after, "%Y") b = Benchmark() for file_name in arguments.infile: with open_file(file_name, "r") as f: p = WOSStream(f, arguments.wos_only, arguments.sample_rate, arguments.must_cite, date_after) output_file = "%s.json" % os.path.basename(f.name).split( ".", maxsplit=1)[0] if arguments.outdir: output_file = os.path.join(arguments.outdir, output_file) if not arguments.force and os.path.isfile(output_file): print("%s already exists, skipping..." % output_file) break with open(output_file, "w", encoding="utf-8") as g: for entry in p.parse(): dump(entry, g, ensure_ascii=False) g.write("\n") b.increment()
parser.add_argument("--must-cite", action="store_true",
                    help="Only include nodes that cite other nodes")
parser.add_argument("-f", "--force",
                    help="If outptut file already exists overwrite it.",
                    action="store_true")
parser.add_argument("-a", "--after",
                    help="Only include nodes published on or after this year")
parser.add_argument("-bf", "--benchmark_freq",
                    help="How often to emit benchmark info",
                    type=int, default=1000000)
parser.add_argument("infile", nargs="+")
arguments = parser.parse_args()

# --after is given as a four-digit year; turn it into a datetime for WOSStream.
date_after = None
if arguments.after:
    date_after = datetime.datetime.strptime(arguments.after, "%Y")

b = Benchmark()

# Convert every input file into "<basename>.json", one JSON entry per line.
for file_name in arguments.infile:
    with open_file(file_name, "r") as f:
        p = WOSStream(f, arguments.wos_only, arguments.sample_rate,
                      arguments.must_cite, date_after)

        # Output name: input basename up to its first dot, plus ".json".
        output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0]
        if arguments.outdir:
            output_file = os.path.join(arguments.outdir, output_file)

        if not arguments.force and os.path.isfile(output_file):
            print("%s already exists, skipping..." % output_file)
            # Skip only this file; `break` here would silently abandon
            # every remaining input file as well.
            continue

        with open(output_file, "w", encoding="utf-8") as g:
            for entry in p.parse():
                dump(entry, g, ensure_ascii=False)
                g.write("\n")
                b.increment()

# NOTE(review): placement reconstructed from a whitespace-collapsed source;
# assumed to run once after all files are processed - confirm.
b.print_freq()
def test_parse_medium(self):
    """Parse MEDIUM_XML and pretty-print every entry via ``self._pp``.

    Makes no assertions; it fails only if parsing or printing raises.
    """
    stream = WOSStream(MEDIUM_XML)
    for record in stream.parse():
        self._pp.pprint(record)
def test_parse_small(self):
    """Check that every entry parsed from SMALL_XML equals ``SMALL_PARSED``."""
    parser = WOSStream(SMALL_XML)
    entries = list(parser.parse())
    # Guard against a vacuous pass: a parser that yields nothing would
    # otherwise satisfy the per-entry comparison below.
    self.assertTrue(entries, "SMALL_XML produced no entries")
    for entry in entries:
        self.assertDictEqual(entry, SMALL_PARSED)
def test_parse_medium(self):
    """Parse MEDIUM_XML and print the ``"subject"`` of each entry that has one.

    Makes no assertions; it fails only if parsing raises.
    """
    stream = WOSStream(MEDIUM_XML)
    for record in stream.parse():
        if "subject" not in record:
            continue
        print(record["subject"])
#!/usr/bin/env python3
"""Write the citation edges of a WOS XML file as tab-separated lines.

For every parsed entry, one line ``<entry id>\t<citation>`` is written per
citation, either to the given output file or to stdout.
"""
from parsers.wos import WOSStream
from util.PajekFactory import PajekFactory
from util.misc import open_file, Benchmark

if __name__ == "__main__":
    import argparse
    import sys

    # NOTE(review): the description mentions Pajek (.net) output, but the
    # code below writes plain "id<TAB>citation" lines.
    parser = argparse.ArgumentParser(
        description="Creates Pajek (.net) files from WOS XML")
    parser.add_argument('infile')
    parser.add_argument(
        'outfile',
        nargs='?',
        type=argparse.FileType('w'),
        default=sys.stdout,
    )
    arguments = parser.parse_args()

    with open_file(arguments.infile) as f:
        stream = WOSStream(f)
        for record in stream.parse():
            # One output line per (entry, citation) pair.
            arguments.outfile.writelines(
                "%s\t%s\n" % (record["id"], cited)
                for cited in record["citations"]
            )