if not options.input: optparser.error("No input file defined") if not options.output: optparser.error("No output file defined") if not options.dictionary: optparser.error("No dictionary file defined") return options, args if __name__=="__main__": options, args = getOptions() f = gzip.GzipFile(options.dictionary, 'r') dictionary = readDictionaryMapping(f) f.close() f = gzip.GzipFile(options.input, 'r') documents = GraphMatrices.readInstances(f) instances = GraphMatrices.buildAMFromFullSentences(documents, MatrixBuilders.buildAdjacencyMatrix, settings, options.parser, options.tokenizer) f.close() f = gzip.GzipFile(options.output,'w') datavector = [] identities = [] for document in instances.itervalues(): for sentence in document.itervalues(): for identity, instance in sentence.iteritems(): identities.append(identity) datavector.append(instance) outputs = [x[2] for x in datavector] datavector = [(x[0], x[1]) for x in datavector] datavector = GraphMatrices.LinearizeGraphs(datavector, dictionary, options.mode) for id, output, features in zip(identities, outputs, datavector): keys = features.keys()
argparser = ArgumentParser(usage="%prog [options]\n-h for help") argparser.add_argument("-i", "--input", dest="input", help="Gzipped xml-file containing the parsed data") argparser.add_argument("-o", "--output", dest="output", help="Output file for writing the dictionary") argparser.add_argument("-p", "--parser", dest="parser", help="Name of the parser", default="split_parse") argparser.add_argument("-t", "--tokenizer", dest="tokenizer", help="Name of the tokenizer", default="split") args = argparser.parse_args() if not args.input: argparser.error("No input file defined") if not args.output: argparser.error("No output file defined") return args if __name__ == "__main__": args = getOptions() with gzip.open(args.input, 'r') as f: documents = GraphMatrices.readInstances(f) instances = GraphMatrices.buildAMFromFullSentences(documents, MatrixBuilders.buildAdjacencyMatrix, settings, args.parser, args.tokenizer) datavector = [] for document in instances.itervalues(): for sentence in document.itervalues(): for identity, instance in sentence.iteritems(): datavector.append(instance) datavector = [(x[0], x[1]) for x in datavector] fmap = GraphMatrices.buildDictionary(datavector) with gzip.open(args.output, 'w') as dict_out: for key in fmap.keys():