def cat(self, path, opts):
    addedopts = getopts(opts, ['libjar'], delete=False)
    streamingjar = findjar(self.hadoop, 'streaming')
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'],
                    shortcuts=dict(configopts('jars')))
    try:
        import typedbytes
        ls = os.popen('%s %s dfs -ls %s' % (hadenv, self.hdfs, path))
        if sum(c in path for c in ("*", "?", "{")) > 0:
            # cat each file separately when the path contains special chars
            lineparts = (line.split()[-1] for line in ls)
            subpaths = [part for part in lineparts if part.startswith("/")]
        else:
            # we still do the ls even in this case to make sure we print errors
            subpaths = [path]
        ls.close()
        for subpath in subpaths:
            if subpath.endswith("/_logs"):
                continue
            dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                              % (hadenv, self.hadoop, streamingjar, subpath))
            ascodeopt = getopt(opts, 'ascode')
            if ascodeopt and ascodeopt[0] == 'yes':
                outputs = dumpcode(typedbytes.PairedInput(dumptb))
            else:
                outputs = dumptext(typedbytes.PairedInput(dumptb))
            for output in outputs:
                print '\t'.join(output)
            dumptb.close()
    except IOError:
        pass  # ignore
    return 0
def cat(self, path, opts):
    streamingjar = findjar(self.hadoop, 'streaming',
                           opts['hadooplib'] if 'hadooplib' in opts else None)
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    hadenv = envdef('HADOOP_CLASSPATH', opts['libjar'],
                    shortcuts=dict(configopts('jars')))
    try:
        import typedbytes
        ls = os.popen('%s %s -ls %s' % (hadenv, self.hdfs, path))
        # only listing lines contain a ":" (in the timestamp column);
        # the last field on each such line is the file path
        subpaths = [line.split()[-1] for line in ls if ":" in line]
        ls.close()
        for subpath in subpaths:
            # skip "hidden" entries such as _logs and _SUCCESS
            if subpath.split("/")[-1].startswith("_"):
                continue
            dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                              % (hadenv, self.hadoop, streamingjar, subpath))
            dump = dumpcode if 'yes' in opts['ascode'] else dumptext
            outputs = dump(typedbytes.PairedInput(dumptb))
            for output in outputs:
                print '\t'.join(output)
            dumptb.close()
    except IOError:
        pass  # ignore
    return 0
def testpairio(self):
    objects = TestIO.objects

    file = open("test.bin", "wb")
    output = typedbytes.PairedOutput(file)
    output.writes(enumerate(objects))
    file.close()

    file = open("test.bin", "rb")
    input = typedbytes.PairedInput(file)
    for index, record in input:
        self.assertEqual(objects[index], record)
    file.close()

    os.remove("test.bin")
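For comparison, here is a sketch of the same round trip without keys. It assumes the package's flat-stream classes typedbytes.Output and typedbytes.Input (and Input's reads() iterator), the unpaired counterparts of the classes used in the test above:

import os

import typedbytes

objects = [True, 42, 3.14, "some string", (1, 2, "three")]

# write a flat stream of objects instead of (key, value) pairs
file = open("test_flat.bin", "wb")
typedbytes.Output(file).writes(objects)
file.close()

# reads() is assumed to yield the objects back in order;
# tuples round-trip via the typedbytes vector type
file = open("test_flat.bin", "rb")
assert list(typedbytes.Input(file).reads()) == objects
file.close()

os.remove("test_flat.bin")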
# Identity filter: copy a stream of typedbytes (key, value) pairs
# from stdin to stdout unchanged.
import sys

import typedbytes

pairs = typedbytes.PairedInput(sys.stdin)
output = typedbytes.PairedOutput(sys.stdout)
output.writes(pairs)
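Because it simply echoes the pair stream, this script can serve as an identity filter anywhere typedbytes is spoken on stdin/stdout. A quick local smoke test, assuming the script above is saved as identity.py (a hypothetical filename):

import StringIO
import subprocess
import sys

import typedbytes

# serialize one pair into an in-memory buffer
buf = StringIO.StringIO()
typedbytes.PairedOutput(buf).writes([('key', 'value')])

# pipe the bytes through the filter (assumed saved as identity.py)
# and decode whatever comes back out
proc = subprocess.Popen([sys.executable, 'identity.py'],
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = proc.communicate(buf.getvalue())
print list(typedbytes.PairedInput(StringIO.StringIO(out)))
# expected: [('key', 'value')]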
import StringIO

import typedbytes


def parse_tb(val):
    # Deserialize a string of typedbytes-encoded pairs
    # into (key, value) tuples.
    fp = StringIO.StringIO(val)
    for pair in typedbytes.PairedInput(fp):
        yield pair
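A round-trip check for parse_tb: serialize a few pairs with PairedOutput into an in-memory buffer, then parse the raw string back. A minimal sketch using only the classes shown above:

import StringIO

import typedbytes

buf = StringIO.StringIO()
typedbytes.PairedOutput(buf).writes([('a', 1), ('b', 2)])

for key, value in parse_tb(buf.getvalue()):
    print key, value
# prints:
# a 1
# b 2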
# Loads a corpus of sentences in typedbytes format from Hadoop into a database.
import os

os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'

import typedbytes
from ratings.models import *

file = open("/Users/pealco/archive/experiments/disagreement/data.tb", 'rb')
input = typedbytes.PairedInput(file)

for sha1, sentence in input:
    s = Sentence(sha1=sha1,
                 sentence=sentence.sentence,
                 grammatical=sentence.grammatical,
                 similarity=sentence.wup_similarity)
    s.save()

    dg = DirectedGraph()
    dg.save()

    # create and persist a database node for every node in the parsed graph
    nodelist = [Node() for node in sentence.dg.nodelist]
    for node in nodelist:
        node.save()

    subject_address = sentence.subject['address']
    intervenor_address = sentence.intervenor['address']
    verb_address = sentence.verb['address']

    for node in sentence.dg.nodelist: