Пример #1
0
 def test_after_date(self):
     """Check how ``date_after`` affects the entries parsed from SMALL_XML.

     With a cutoff of 2015 the parser is expected to yield nothing; with a
     cutoff of 2014 every yielded entry must equal ``SMALL_PARSED``.
     """
     parser = WOSStream(SMALL_XML, date_after=datetime.datetime.strptime("2015", "%Y"))
     # Raising a bare string is itself a TypeError in Python 3, so an
     # unexpected entry is reported through a regular unittest assertion.
     self.assertListEqual(list(parser.parse()), [])

     parser = WOSStream(SMALL_XML, date_after=datetime.datetime.strptime("2014", "%Y"))
     for entry in parser.parse():
         self.assertDictEqual(entry, SMALL_PARSED)
Пример #2
0
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size,
               date_after, filter_set):
    """Parse queued WOS files and publish their entries in batches.

    File names are read from ``files`` until the ``'STOP'`` sentinel is seen.
    Each file is opened with ``open_file`` and streamed through ``WOSStream``.
    Kept entries are collected into a list that is put on ``entries`` as soon
    as it holds ``batch_size`` items; a partial batch is flushed at the end of
    every file.

    Args:
        files: Queue-like object; ``get()`` and ``task_done()`` are used.
        entries: Queue-like object; ``put()`` is called with lists of entries.
        wos_only: Forwarded unchanged to ``WOSStream``.
        sample_rate: Forwarded unchanged to ``WOSStream``.
        must_cite: Forwarded unchanged to ``WOSStream``.
        batch_size: Batch length that triggers a flush to ``entries``.
        date_after: Forwarded unchanged to ``WOSStream``.
        filter_set: Optional container of ids. When truthy, entries whose
            ``"id"`` is not in it are dropped and counted as excluded.
    """
    log = logging.getLogger(__name__).getChild('parser')
    batch = []
    filtered_out = 0
    wrote = 0
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f,
                          wos_only=wos_only,
                          sample_rate=sample_rate,
                          must_cite=must_cite,
                          date_after=date_after)
            for entry in p.parse():
                if filter_set and entry["id"] not in filter_set:
                    filtered_out += 1
                    continue
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    # Count what was actually sent; adding batch_size
                    # miscounts whenever the two differ (e.g. batch_size <= 0).
                    wrote += len(batch)
                    batch = []
        if batch:
            entries.put(batch)
            wrote += len(batch)
            batch = []
        files.task_done()

    log.info("Wrote %s entries", wrote)
    if filter_set:
        log.info("Excluded %s entries", filtered_out)

    # Acknowledge the 'STOP' sentinel that ended the loop above.
    files.task_done()
Пример #3
0
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size, date_after, filter_set):
    """Worker loop: turn queued WOS file names into batches of parsed entries.

    Pulls file names from ``files`` until ``'STOP'`` is received, parses each
    one with ``WOSStream`` and puts lists of entries on ``entries``. A batch is
    flushed once it reaches ``batch_size`` items and again (if non-empty) when
    a file ends. When ``filter_set`` is truthy, entries whose ``"id"`` is not
    a member are skipped. Totals are logged once the sentinel is reached.

    ``wos_only``, ``sample_rate``, ``must_cite`` and ``date_after`` are handed
    to ``WOSStream`` as keyword arguments without modification.
    """
    log = logging.getLogger(__name__).getChild('parser')
    batch = []
    filtered_out = 0
    wrote = 0
    for filename in iter(files.get, 'STOP'):
        with open_file(filename) as f:
            p = WOSStream(f, wos_only=wos_only, sample_rate=sample_rate, must_cite=must_cite, date_after=date_after)
            for entry in p.parse():
                if filter_set and entry["id"] not in filter_set:
                    filtered_out += 1
                    continue
                batch.append(entry)
                if len(batch) >= batch_size:
                    entries.put(batch)
                    # Use the real batch length so the logged total stays
                    # correct even when batch_size is zero or negative.
                    wrote += len(batch)
                    batch = []
        if batch:
            entries.put(batch)
            wrote += len(batch)
            batch = []
        files.task_done()

    log.info("Wrote %s entries", wrote)
    if filter_set:
        log.info("Excluded %s entries", filtered_out)

    # One more task_done() for the 'STOP' item consumed by iter().
    files.task_done()
Пример #4
0
    def test_parse_nowos(self):
        """Check ``wos_only`` parsing of NOWOS_XML, with and without ``must_cite``.

        With ``wos_only=True`` the first entry must match ``SMALL_PARSED``
        except for the title ("Potatoes") and an empty citation list. Adding
        ``must_cite=True`` must leave no entries at all.
        """
        parser = WOSStream(NOWOS_XML, wos_only=True)
        entries = list(parser.parse())
        # Work on a copy: assigning into SMALL_PARSED itself would leak the
        # modified title/citations into every other test that compares
        # against the shared fixture.
        result = dict(SMALL_PARSED)
        result["title"] = "Potatoes"
        result["citations"] = []
        self.assertDictEqual(entries[0], result)

        parser = WOSStream(NOWOS_XML, wos_only=True, must_cite=True)
        entries = list(parser.parse())
        self.assertListEqual(entries, [])
Пример #5
0
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size, date_after):
    """Consume file names from ``files`` and emit parsed entries in batches.

    Runs until ``files.get()`` returns ``'STOP'``. Each file is opened with
    ``open_file`` and parsed by ``WOSStream`` (which receives ``wos_only``,
    ``sample_rate``, ``must_cite`` and ``date_after``). Entries are put on
    ``entries`` as lists: one whenever ``batch_size`` items have accumulated,
    plus any remainder at the end of each file.
    """
    pending = []
    for path in iter(files.get, 'STOP'):
        with open_file(path) as handle:
            stream = WOSStream(
                handle,
                wos_only=wos_only,
                sample_rate=sample_rate,
                must_cite=must_cite,
                date_after=date_after,
            )
            for record in stream.parse():
                pending.append(record)
                if len(pending) >= batch_size:
                    entries.put(pending)
                    pending = []
        # Flush whatever is left over from this file.
        if len(pending):
            entries.put(pending)
            pending = []
        files.task_done()
    # Acknowledge the 'STOP' sentinel as well.
    files.task_done()
Пример #6
0
def wos_parser(files, entries, wos_only, sample_rate, must_cite, batch_size,
               date_after):
    """Parse every file named on ``files`` and forward entries to ``entries``.

    File names are fetched until the ``'STOP'`` sentinel arrives. Parsed
    entries are grouped into lists that are put on ``entries`` when they reach
    ``batch_size`` items, and once more at the end of each file if anything is
    left. ``task_done()`` is called on ``files`` per file and once for the
    sentinel.
    """
    options = dict(wos_only=wos_only,
                   sample_rate=sample_rate,
                   must_cite=must_cite,
                   date_after=date_after)
    chunk = []
    for source_name in iter(files.get, 'STOP'):
        with open_file(source_name) as source:
            for item in WOSStream(source, **options).parse():
                chunk.append(item)
                if len(chunk) >= batch_size:
                    entries.put(chunk)
                    chunk = []
        if chunk:
            # End of file: send the partial batch.
            entries.put(chunk)
            chunk = []
        files.task_done()
    files.task_done()
Пример #7
0
 def test_after_date(self):
     """Verify the ``date_after`` cutoff against the SMALL_XML fixture.

     A 2015 cutoff must produce no entries; a 2014 cutoff must only produce
     entries equal to ``SMALL_PARSED``.
     """
     parser = WOSStream(SMALL_XML,
                        date_after=datetime.datetime.strptime("2015", "%Y"))
     # ``raise "<text>"`` is a TypeError in Python 3 (only BaseException
     # subclasses can be raised); assert on the parsed list instead so a
     # regression shows up as a normal test failure.
     self.assertListEqual(list(parser.parse()), [])

     parser = WOSStream(SMALL_XML,
                        date_after=datetime.datetime.strptime("2014", "%Y"))
     for entry in parser.parse():
         self.assertDictEqual(entry, SMALL_PARSED)
Пример #8
0
    def test_parse_nowos(self):
        """Parse NOWOS_XML with ``wos_only=True`` and compare to the fixture.

        The first parsed entry is expected to equal ``SMALL_PARSED`` with the
        title replaced by "Potatoes" and no citations. When ``must_cite=True``
        is also set, the parser is expected to yield nothing.
        """
        parser = WOSStream(NOWOS_XML, wos_only=True)
        entries = list(parser.parse())

        # Build the expectation from a copy so the shared SMALL_PARSED fixture
        # is not modified for tests that run afterwards.
        result = dict(SMALL_PARSED)
        result["title"] = "Potatoes"
        result["citations"] = []
        self.assertDictEqual(entries[0], result)

        parser = WOSStream(NOWOS_XML, wos_only=True, must_cite=True)
        entries = list(parser.parse())
        self.assertListEqual(entries, [])
Пример #9
0
                        '--benchmark_freq',
                        help="How often to emit benchmark info",
                        type=int,
                        default=1000000)
    parser.add_argument('infile', nargs='+')
    arguments = parser.parse_args()

    # --after is parsed with the "%Y" format; no date filter when omitted.
    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark()

    # Convert each input file to "<name up to first dot>.json", one JSON
    # document per line.
    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate,
                          arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(
                ".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                # Skip only this input. ``break`` would abandon every
                # remaining file, contradicting the message above.
                continue

            with open(output_file, "w", encoding="utf-8") as g:
                for entry in p.parse():
                    dump(entry, g, ensure_ascii=False)
                    g.write("\n")
                    b.increment()
Пример #10
0
    parser.add_argument("--must-cite", action="store_true", help="Only include nodes that cite other nodes")
    parser.add_argument("-f", "--force", help="If output file already exists overwrite it.", action="store_true")
    parser.add_argument("-a", "--after", help="Only include nodes published on or after this year")
    parser.add_argument("-bf", "--benchmark_freq", help="How often to emit benchmark info", type=int, default=1000000)
    parser.add_argument("infile", nargs="+")
    arguments = parser.parse_args()

    # --after is parsed with the "%Y" format; no date filter when omitted.
    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark()

    # Each input becomes "<name up to first dot>.json" with one JSON document
    # per line, optionally placed under --outdir.
    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate, arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                # Move on to the next input. ``break`` would stop processing
                # all remaining files instead of skipping just this one.
                continue

            with open(output_file, "w", encoding="utf-8") as g:
                for entry in p.parse():
                    dump(entry, g, ensure_ascii=False)
                    g.write("\n")
                    b.increment()
    b.print_freq()
Пример #11
0
 def test_parse_medium(self):
     """Pretty-print every entry parsed from MEDIUM_XML (no assertions)."""
     stream = WOSStream(MEDIUM_XML)
     records = stream.parse()
     for record in records:
         self._pp.pprint(record)
Пример #12
0
 def test_parse_small(self):
     """Each entry parsed from SMALL_XML must equal SMALL_PARSED."""
     records = WOSStream(SMALL_XML).parse()
     for record in records:
         self.assertDictEqual(record, SMALL_PARSED)
Пример #13
0
 def test_parse_medium(self):
     """Print the "subject" value of each MEDIUM_XML entry that has one."""
     stream = WOSStream(MEDIUM_XML)
     for record in stream.parse():
         if "subject" not in record:
             continue
         print(record["subject"])
Пример #14
0
 def test_parse_medium(self):
     """Walk the entries of MEDIUM_XML and print any "subject" field found."""
     records = WOSStream(MEDIUM_XML).parse()
     for record in records:
         # Entries without a subject are silently skipped.
         if "subject" not in record:
             continue
         print(record["subject"])
Пример #15
0
 def test_parse_medium(self):
     """Dump each entry parsed from MEDIUM_XML through the test's pretty-printer."""
     records = WOSStream(MEDIUM_XML).parse()
     for record in records:
         self._pp.pprint(record)
Пример #16
0
 def test_parse_small(self):
     """Compare every entry yielded for SMALL_XML against SMALL_PARSED."""
     stream = WOSStream(SMALL_XML)
     parsed = stream.parse()
     for record in parsed:
         self.assertDictEqual(record, SMALL_PARSED)
Пример #17
0
#!/usr/bin/env python3
from parsers.wos import WOSStream
from util.PajekFactory import PajekFactory
from util.misc import open_file, Benchmark

if __name__ == "__main__":
    import argparse
    import sys

    # Command line: a required input path and an optional output path
    # (standard output when omitted).
    cli = argparse.ArgumentParser(description="Creates Pajek (.net) files from WOS XML")
    cli.add_argument('infile')
    cli.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    options = cli.parse_args()

    with open_file(options.infile) as source:
        stream = WOSStream(source)

        # One tab-separated "<entry id>\t<citation>" line per citation.
        for record in stream.parse():
            for cited in record["citations"]:
                line = "%s\t%s\n" % (record["id"], cited)
                options.outfile.write(line)