import os
import sys

# gateannotator, util and ie are project-local modules assumed to live in
# the same repository.
import gateannotator
import ie
import util


def main():
  if len(sys.argv) != 5:
    print("Usage: %s xgappfile source_folder gate_output_folder target_folder" % (sys.argv[0]));
    print("Source folder should only contain plaintext profile data for processing");
    return

  _, xgappfile, source_folder, gate_output_folder, target_folder = sys.argv

  xgapp_canonical = os.path.realpath(xgappfile)
  src_canonical = os.path.realpath(source_folder)
  target_canonical = os.path.realpath(target_folder)

  os.makedirs(gate_output_folder, exist_ok=True)
  gateannotator.run_gate_annotator(xgapp_canonical, src_canonical, gate_output_folder)

  for infile, outfile in util.traverse_mirror(gate_output_folder, target_folder):
    print("*** Converting %s to %s" % (infile, outfile))
    xml = ie.convert_file(infile)
    with open(outfile, "w") as f:
      f.write(xml)
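
# Example 1 above and the EuroNext CLI below both rely on util.traverse_mirror
# to pair each input file with a mirrored output path. The following is a
# hypothetical minimal sketch of such a helper; the real util module is
# project-local and not shown here.
import os

def traverse_mirror(src_root, dst_root, src_ext='', dst_ext=''):
    # Walk src_root; for each file whose name ends with src_ext, yield its
    # path together with the mirrored output path under dst_root, swapping
    # src_ext for dst_ext and creating the output directory as we go.
    for dirpath, _dirnames, filenames in os.walk(src_root):
        for name in filenames:
            if not name.endswith(src_ext):
                continue
            rel = os.path.relpath(dirpath, src_root)
            out_dir = os.path.normpath(os.path.join(dst_root, rel))
            os.makedirs(out_dir, exist_ok=True)
            out_name = name[:len(name) - len(src_ext)] + dst_ext
            yield os.path.join(dirpath, name), os.path.join(out_dir, out_name)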
Example 2
import argparse
import os
import pickle
import re
import time

from rdflib import Graph

# util, search, scrape, rdfconvert, logger and the NS namespace map are
# assumed to be project-local helpers defined elsewhere in this repository.


def main():
    parser = argparse.ArgumentParser(
        description='Searcher, scraper and RDF converter for EuroNext.'
    )

    subparser = parser.add_subparsers(help='commands', dest='command')

    # Search command
    search_command = subparser.add_parser('search', help='Search EuroNext website')
    search_subcommands = search_command.add_subparsers(help='Search commands', dest='search_subcommand')
    search_command.add_argument('-o', dest='output', help='Write search results to file, which can be used as input to the scrape command')
    search_command.add_argument('--max-results', default=None, type=int, dest='maxresults', help='Maximum results from search')

    keyword_command = search_subcommands.add_parser('keyword', help='Search EuroNext website by keyword')
    keyword_command.add_argument('keyword', help='Keyword to search by')
    icb_command = search_subcommands.add_parser('icb', help='Search EuroNext website by ICB code')
    icb_command.add_argument('icb', help='ICB code to search by (e.g. 7000 will find all matching 7XXX)')

    def add_pickle_argument(command):
        command.add_argument('--pickle', action='store_true', default=False,
            help='Output as pickled objects. Can be converted to RDF using the '
                 'rdfconvert command. Used to allow changes to the RDF format '
                 'without having to write converters for RDF output files.')
    def add_extract_profiles_argument(command):
        command.add_argument('--extract-profiles', dest='extract_profiles',
            help='Extract cp:profile as text files into the given folder, which can then be processed with GATE.')

    # Scrape commands
    scrapeone_command = subparser.add_parser('scrapeone', help='Scrape a page from EuroNext given ISIN and MIC')
    scrapeone_command.add_argument('isin', help='ISIN number of company')
    scrapeone_command.add_argument('mic', help='ISO 10383 MIC for the company (appears in the source URL)')
    scrapeone_command.add_argument('outputfile', help='Path to a writable output file')
    add_pickle_argument(scrapeone_command)

    scrape_command = subparser.add_parser('scrape', help='Scrape from a file')
    scrape_command.add_argument('inputfile', help='Path to file containing space-separated ISINs and MICs, one per line.' + \
      " Can be generated with the 'search' command.")
    scrape_command.add_argument('outputdir', help='Path to a writeable output directory')
    add_pickle_argument(scrape_command)

    # rdfconvert command
    rdfconvert_command = subparser.add_parser('rdfconvert', help='Convert pickled objects to RDF')
    rdfconvert_command.add_argument('inputpath', help='Source file or folder (if --batch)')
    rdfconvert_command.add_argument('outputpath', help='Destination file or folder (if --batch)')
    rdfconvert_command.add_argument('--batch', action='store_true', default=False, help='Convert all .pickle files recursively in "inputpath"')

    extract_profiles_command = subparser.add_parser('extractprofiles',
      help='Extract cp:profile as text files into the given folder, which can then be processed with GATE')
    extract_profiles_command.add_argument('inputdir', help='Directory containing cp:graphs')
    extract_profiles_command.add_argument('outputdir', help='Output directory')

    args = parser.parse_args()

    if args.command == 'search':
        if hasattr(args, 'keyword'):
            search(keyword=args.keyword, outputfile=args.output, maxresults=args.maxresults)
        elif hasattr(args, 'icb'):
            search(icb=args.icb, outputfile=args.output, maxresults=args.maxresults)
    elif args.command == 'scrapeone':
        scrape(args.isin, args.mic, args.outputfile, args.pickle)
    elif args.command == 'scrape':
        with open(args.inputfile) as f:
            isins_mics = util.read_space_delimited_file(f)
        for isin_mic in isins_mics:
            extension = 'pickle' if args.pickle else 'n3'
            timestamp = int(time.time() * 1000)
            outputfile = "%s/%s-%s-%s.%s" % (args.outputdir, isin_mic[0], isin_mic[1], timestamp, extension)
            print("Scraping %s, %s to %s" % (isin_mic[0], isin_mic[1], outputfile))
            try:
                scrape(isin_mic[0], isin_mic[1], outputfile, args.pickle, timestamp=timestamp)
            except Exception:
                logger.exception("Failed to scrape %s", isin_mic)
    elif args.command == 'rdfconvert':
        if args.batch:
            files = list(util.traverse_mirror(args.inputpath, args.outputpath, '.pickle', '.n3'))
        else:
            files = [(args.inputpath, args.outputpath)]

        for inputfile, outputfile in files:
            print("Converting %s to %s" % (inputfile, outputfile))
            with open(inputfile, 'rb') as f:
                scraped = pickle.load(f)
                rdfconvert(scraped, outputfile)
    elif args.command == 'extractprofiles':
        for directory, filename in util.traverse(args.inputdir, '.n3'):
            inputfile = os.path.join(directory, filename)
            g = Graph()
            g.parse(inputfile, format='n3')
            for cp_id, _, profile in g.triples((None, NS['cp']['profile'], None)):
                # Output into a language-specific folder
                outdir = os.path.join(args.outputdir, profile.language)
                os.makedirs(outdir, exist_ok=True)

                profile_id = cp_id.split('#')[1]
                outputfile = os.path.join(outdir, "%s.txt" % profile_id)
                print(inputfile, "->", outputfile)
                with open(outputfile, "w") as f:
                    # Replace HTML tags so they do not get passed as input to GATE
                    profile_clean = re.sub(r'<[^>]+>', '\n', profile)
                    f.write(profile_clean)
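
# The extractprofiles command above walks the input tree with util.traverse.
# A plausible sketch, assuming it yields (directory, filename) pairs for
# files whose names end with the given extension:
import os

def traverse(root, extension):
    # Yield (directory, filename) for every matching file under root.
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            if name.endswith(extension):
                yield dirpath, name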
Example 3
import argparse
import pickle
import time

# util, search_index_constituents, scrape, rdfconvert and logger are assumed
# to be project-local helpers defined elsewhere in this repository.


def main():
    parser = argparse.ArgumentParser(
        description='Searcher, scraper and RDF converter for Deutsche Borse.'
    )

    subparser = parser.add_subparsers(help='commands', dest='command')

    # Search command
    search_command = subparser.add_parser('search', help='Search Deutsche Borse index constituents')
    search_command.add_argument('isin', help='ISIN of the Deutsche Borse index')
    search_command.add_argument('-o', dest='output_file', help='Output file for results')

    def add_pickle_argument(command):
        command.add_argument('--pickle', action='store_true', default=False,
            help='Output as pickled objects. Can be converted to RDF using the '
                 'rdfconvert command. Used to allow changes to the RDF format '
                 'without having to write converters for RDF output files.')

    # Scrape commands
    scrapeone_command = subparser.add_parser('scrapeone', help='Scrape a page given ISIN')
    scrapeone_command.add_argument('isin', help='ISIN number of company')
    scrapeone_command.add_argument('outputfile', help='Path to a writable output file')
    add_pickle_argument(scrapeone_command)

    scrape_command = subparser.add_parser('scrape', help='Scrape from a file')
    scrape_command.add_argument('inputfile', help='Path to file containing space-separated ISINs and MICs, one per line.' + \
      " Can be generated with the 'search' command.")
    scrape_command.add_argument('outputdir', help='Path to a writeable output directory')
    add_pickle_argument(scrape_command)

    # rdfconvert command
    rdfconvert_command = subparser.add_parser('rdfconvert', help='Convert pickled objects to RDF')
    rdfconvert_command.add_argument('inputpath', help='Source file or folder (if --batch)')
    rdfconvert_command.add_argument('outputpath', help='Destination file or folder (if --batch)')
    rdfconvert_command.add_argument('--batch', action='store_true', default=False, help='Convert all .pickle files recursively in "inputpath"')

    args = parser.parse_args()

    if args.command == 'search':
        search_index_constituents(args.isin, output_file=args.output_file)
    elif args.command == 'scrapeone':
        scrape(args.isin, args.outputfile, args.pickle)
    elif args.command == 'scrape':
        extension = 'pickle' if args.pickle else 'n3'
        with open(args.inputfile) as f:
            isins = [l[0] for l in util.read_space_delimited_file(f)]
            print(len(isins), "ISINs found")
        for i, isin in enumerate(isins):
            timestamp = int(time.time() * 1000)
            outputfile = "%s/%s-%s.%s" % (args.outputdir, isin, timestamp, extension)
            print("%d. Scraping %s to %s" % (i+1, isin, outputfile))
            try:
                scrape(isin, outputfile, args.pickle, timestamp=timestamp)
            except Exception as e:
                logger.exception("Failed to scrape %s: %s", isin, str(e))
            time.sleep(1)
    elif args.command == 'rdfconvert':
        if args.batch:
            files = list(util.traverse_mirror(args.inputpath, args.outputpath, '.pickle', '.n3'))
        else:
            files = [(args.inputpath, args.outputpath)]

        for inputfile, outputfile in files:
            print("Converting %s to %s" % (inputfile, outputfile))
            with open(inputfile, 'rb') as f:
                scraped = pickle.load(f)
                rdfconvert(scraped, outputfile)
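
# Both scrape commands read their input records via
# util.read_space_delimited_file. A minimal sketch, assuming one
# whitespace-separated record per line:
def read_space_delimited_file(f):
    # Split each non-blank line of the open file into its fields.
    return [line.split() for line in f if line.strip()]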