Example #1
import sh


def extract_data(input_file_name, output_file_names):
    sh.unzip("-o", input_file_name)

    # This also updates the timestamps.  Ruffus doesn't recognize these files as complete
    # results unless the timestamps are up to date.
    sh.mv("testdata.manual.2009.06.14.csv", "sentiment140.test.csv")
    sh.mv("training.1600000.processed.noemoticon.csv", "sentiment140.train.csv")

    # Re-encode the files as utf8.  They look like utf8 already (e.g. `file` reports them as
    # utf8), but they are actually encoded as latin1.  This doesn't make a difference for the
    # test data (the utf8- and latin1-encoded test data are identical files), but the train data
    # has some byte sequences that are invalid utf8, and that makes simplejson really upset.
    # (A pure-Python equivalent of this re-encode follows the example.)
    for output_file in output_file_names:
        sh.mv(output_file, "temp")
        sh.iconv("-f", "latin1", "-t", "utf8", "temp", _out=output_file)
        sh.rm("temp")
Example #2

	# Requires: import os, tempfile, sh; from sh import ErrorReturnCode
	def import_csv(self, username, userpass, filestream):
		retCode = True
		import_xml_command = sh.Command("/usr/share/ds-matricula-plugin/matricula-common-scripts/1-itaca2mysql")
		# Create temporary file names (mkstemp() returns an (fd, path) pair;
		# only the path is used here)
		TMP_CSV = tempfile.mkstemp()[1]
		TMP_CSV2 = tempfile.mkstemp()[1]
		if self._put_file(TMP_CSV, filestream):
			TMP_XML = tempfile.mkstemp()[1]
			# Normalize the CSV header line (strip blanks and dots, fold ñ/Ñ to n, lowercase)
			# while re-encoding from ISO-8859-15 to UTF-8.  (A pure-Python equivalent of this
			# sed program follows the example.)
			sh.sed(sh.iconv("-f", "ISO-8859-15", "-t", "UTF-8", TMP_CSV), "-e", r"1{s%[[:blank:]]%%g;s%\.%%g;s%[ñÑ]%n%g;s%.*%\L&%}", _out=TMP_CSV2)

			if self._csv2xml(TMP_CSV2, TMP_XML):
				try:
					import_xml_command(TMP_XML)
				except ErrorReturnCode:
					# the import script exited non-zero
					retCode = False

			os.remove(TMP_XML)
		os.remove(TMP_CSV)
		os.remove(TMP_CSV2)
		return retCode
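The sed one-liner above rewrites only the CSV header (line 1). A rough pure-Python equivalent, shown for clarity (normalize_header is a hypothetical helper; the encodings mirror the iconv call above):

import io
import re


def normalize_header(csv_path, out_path):
    # Mirror the sed program on line 1 only: strip blanks, drop dots,
    # fold ñ/Ñ to plain n, and lowercase the whole line.
    with io.open(csv_path, encoding="iso-8859-15") as inf, \
            io.open(out_path, "w", encoding="utf-8") as outf:
        for i, line in enumerate(inf):
            if i == 0:
                line = re.sub(r"[ \t]", "", line)
                line = line.replace(".", "")
                line = line.replace(u"ñ", "n").replace(u"Ñ", "n")
                line = line.lower()
            outf.write(line)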
Example #3
    # Requires (Python 2): import logging, multiprocessing, sh, plus module-level
    # helpers mid_datadir, file_suffix, gentime_type, dict_dir and inner_process.
    def process(self):
        if not self.infile:
            logging.fatal("no input specified")
            return
        if self.pro_type == 'line':
            noquery_path = mid_datadir + self.lang + '/noquerys_' + file_suffix
            with open(self.infile) as inf, open(self.outfile, 'ab') as of, \
                    open(noquery_path, 'ab') as noqueryfile:
                for line in inf:
                    text = self.line_process(line.strip(), self.infile)
                    if text:
                        of.write(text + '\n')
                    else:
                        noqueryfile.write(line.strip() + '\n')

        elif self.pro_type in ['block', 'recall']:
            if len(self.operators):
                self.operators[0](self.infile, self.outfile)
        elif self.pro_type == 'analysis':
            if len(self.operators):
                with open(self.infile) as inf, open(self.outfile, 'ab') as of:
                    for line in inf:
                        text = self.analysis_process(line)
                        of.write(text.strip() + '\n')
        elif self.pro_type == 'stem':
            # Fan the work out across CPUs: one temp input/output file pair per
            # worker (a standalone sketch of this pattern follows the example).
            self.load_stem()
            self.open_temps = []
            self.in_files = []
            self.out_files = []
            self.args = []

            for i in xrange(self.cpu_num):
                f = open(mid_datadir + self.lang + '/tempin' + str(i), 'wb')
                self.open_temps.append(f)
                self.in_files.append(
                    mid_datadir + self.lang + '/tempin' + str(i))
                self.out_files.append(
                    mid_datadir + self.lang + '/tempout' + str(i))
            for i in xrange(self.cpu_num):
                self.args.append(
                    (self.in_files[i], self.out_files[i], self.lang, self.stems, self.operators[0]))
            # Distribute input lines round-robin across the per-worker temp files.
            with open(self.infile) as f:
                for i, line in enumerate(f):
                    self.open_temps[i % self.cpu_num].write(line.strip() + '\n')
            for i in xrange(self.cpu_num):
                self.open_temps[i].close()

            # Process the shards in parallel, then merge the per-worker outputs.
            pool = multiprocessing.Pool(self.cpu_num)
            pool.map(inner_process, self.args)
            sh.cat(self.out_files, _out=self.outfile)
            sh.rm('-rf', self.in_files)
            sh.rm('-rf', self.out_files)
        elif self.pro_type == 'dictgen':
            # Rewrite the dictionary entries, re-encode them to gb18030, then build
            # the binary dictionary with the external createbin tool.
            tmp1 = mid_datadir + self.lang + '/temp1'
            tmp2 = mid_datadir + self.lang + '/temp2'
            tmp3 = mid_datadir + self.lang + '/temp3'
            sh.cat(self.infile, _out=tmp1)

            with open(tmp1) as inf, open(tmp2, 'wb') as outf:
                for line in inf:
                    text = line.strip().split('\t')
                    outf.write(
                        '\t'.join([text[0], gentime_type[text[1]][0], text[2], gentime_type[text[1]][1], '\n']))
            sh.iconv('-f', 'utf8', '-t', 'gb18030', tmp2, _out=tmp3)
            sh.createbin(
                '-n', 'temp3', '-N', 'fanshixiao_acdict', '-p', mid_datadir +
                self.lang + '/',
                '-P', dict_dir + self.lang + '/', '-f', '%z%d%d%d', '-k', '0', '-t', '1', '-m', '10000000')
            sh.rm('-rf', [tmp3, tmp1])
            sh.mv(tmp2, self.outfile)
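The 'stem' branch above is a shard/map/merge pattern: split the input round-robin into one temp file per CPU, run a pool of workers over the shards, then concatenate the per-worker outputs. A self-contained sketch of that pattern (Python 3; worker is a stand-in for the real inner_process):

import multiprocessing
import os
import shutil
import tempfile


def worker(paths):
    # Stand-in for the real per-shard processing: copy lines through unchanged.
    in_path, out_path = paths
    with open(in_path) as inf, open(out_path, "w") as outf:
        for line in inf:
            outf.write(line)


def shard_map_merge(in_file, out_file, cpu_num):
    tmpdir = tempfile.mkdtemp()
    shards = [(os.path.join(tmpdir, "in%d" % i), os.path.join(tmpdir, "out%d" % i))
              for i in range(cpu_num)]
    # Fan input lines out round-robin across the shard input files.
    handles = [open(in_path, "w") for in_path, _ in shards]
    with open(in_file) as f:
        for i, line in enumerate(f):
            handles[i % cpu_num].write(line)
    for h in handles:
        h.close()
    # Process the shards in parallel, then merge the outputs in shard order.
    pool = multiprocessing.Pool(cpu_num)
    pool.map(worker, shards)
    pool.close()
    pool.join()
    with open(out_file, "w") as merged:
        for _, out_path in shards:
            with open(out_path) as part:
                shutil.copyfileobj(part, merged)
    shutil.rmtree(tmpdir)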
Example #5
import sh


def strip_invalid_utf8(filename):
    # `iconv -c` drops sequences that can't be converted, so the returned stdout
    # is clean UTF-8.  (The positional argument is a file operand for iconv.)
    return sh.iconv(filename, "-c", "-t", "UTF-8")
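For comparison, the same cleanup can be done in-process by decoding with errors="ignore", which drops invalid byte sequences much like iconv's -c flag (a minimal sketch, assuming the input bytes are meant to be utf8):

def strip_invalid_utf8_bytes(raw):
    # Drop byte sequences that are not valid utf8; return clean utf8 bytes.
    return raw.decode("utf-8", errors="ignore").encode("utf-8")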