def SingleProcess(corpusRootPath, lstmOutputRootPath, clas, language, wordDimension, corpusType):
    representationDim = 50
    corpusPath = corpusRootPath
    lstmOutputPath = lstmOutputRootPath + corpusType + "/"
    branchPath = str(wordDimension) + "d/" + language + "/" + clas + "/"
    if not os.path.exists(lstmOutputPath + branchPath):
        os.makedirs(lstmOutputPath + branchPath)

    dictPath = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.extract_" + str(wordDimension) + ".lstmDict"
    seriFilePath = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.extract_" + str(wordDimension) + ".serialization"
    labelFilePath = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.label"
    datasetPath = lstmOutputPath + branchPath + clas + "_dataSet" + str(wordDimension) + ".pkl"

    # Build the pickled data set once.
    if not os.path.exists(datasetPath):
        preprocess(seriFilePath, labelFilePath, datasetPath, wordDimension,
                   batch_size=10, dictCount=wc(dictPath))

    # Train the LSTM only if its best-projection output is not already present.
    if not os.path.exists(lstmOutputPath + branchPath + '/test_proj_best.txt'):
        train_lstm(lstmOutPutRootPath=lstmOutputPath + branchPath,
                   dictPath=dictPath,
                   dataSetPath=datasetPath,
                   dim_proj=wordDimension,
                   n_words=wc(dictPath),
                   max_epochs=30,
                   test_size=wc(seriFilePath))

    # Turn fragment vectors into sentence vectors.
    numberFile = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.number"
    fragmentVectorFile = lstmOutputPath + str(wordDimension) + "d/" + language + "/" + clas + "/test_proj_best.txt"
    indexFile = lstmOutputPath + str(wordDimension) + "d/" + language + "/" + clas + "/" + corpusType + "_" + clas + "_new.txt.index"
    sentenceVectorFile = lstmOutputPath + str(wordDimension) + "d/" + language + "/" + clas + "/" + corpusType + "_" + clas + "_new.txt.sent"
    genSentenceVector(numberFile, fragmentVectorFile, indexFile, sentenceVectorFile, representationDim)

    # Pad the sentence vectors with zero vectors and write the final embed/index files.
    branchPath = str(wordDimension) + "d/" + language + "/" + clas + "/"
    indexFile = lstmOutputPath + branchPath + corpusType + "_" + clas + "_new.txt.index"
    sentFile = lstmOutputPath + branchPath + corpusType + "_" + clas + "_new.txt.sent"
    numberFile = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.number"
    newSentFile = lstmOutputPath + branchPath + clas + "_" + corpusType + "_embed_" + str.upper(language) + ".sent"
    newindexFile = lstmOutputPath + branchPath + clas + "_" + corpusType + "_index_" + str.upper(language) + ".sent"
    AddZerosVectorToSent(indexFile, sentFile, numberFile, newSentFile, newindexFile, representationDim)
def make(f_in=sys.stdin, f_out=sys.stdout):
    # When reading from stdin, spool the input to a temporary file so it
    # can be read several times below.
    if f_in == sys.stdin:
        f_temp = tempfile.NamedTemporaryFile(mode='w', delete=False)
        f_temp_name = f_temp.name
        for line in f_in:
            f_temp.write(line)
        f_temp.close()
    else:
        f_temp_name = f_in.name

    # pdb.set_trace()
    linecount = wc.wc(open(f_temp_name), 'lines')
    linecount = int(linecount) - 1

    # Subtract the number of blank lines.
    f_in = open(f_temp_name, 'r')
    for line in f_in:
        if len(line) == 0:
            linecount -= 1
    f_in.close()

    f_out.write('#2.1')
    f_in = open(f_temp_name, 'r')
    num_cols = 0
    for line in f_in:
        cols = line.strip().split('\t')
        if num_cols == 0:
            # Header row: emit the dimensions line and the column names.
            num_cols = len(cols)
            f_out.write('\n' + '\t'.join([str(linecount), str(num_cols - 5)]))
            f_out.write('\n' + '\t'.join(cols[-2:] + cols[3:-2]))
        else:
            # Data row: identifier columns first, then the value columns.
            f_out.write('\n' + '\t'.join(cols[-2:]) +
                        ' |@chr' + cols[0] + ':' + cols[1] + '-' + cols[2] + '|')
            f_out.write('\t'.join([''] + cols[3:-2]))
            lc = len(cols)
            if lc < num_cols:
                pdb.set_trace()
                f_out.write('\t'.join([''] * (num_cols - lc + 1)))
def test_wc_on_real_py_file(tmp_path):
    f = tmp_path / "driving.py"
    urlretrieve(py_file, f)
    output = wc(f.resolve())
    counts = ' '.join(output.split()[:3])
    # https://twitter.com/pybites/status/1175795375904628736
    expected = "7 29 216"  # not 8!
    assert counts == expected
    assert f.name in output
def test_wc(some_text, expected, tmp_path):
    f = tmp_path / "some_file.txt"
    f.write_bytes(some_text)
    output = wc(f.resolve())
    # replace tabs / multiple spaces by single space
    counts = ' '.join(output.split()[:3])
    assert counts == expected
    # file with/without path allowed
    assert f.name in output
def preprocess(trainVectorFilePath, trainLabelFilePath, testVectorFilePath, testLabelFilePath,
               dataSetPath, wordDimension, batch_size=200):
    print('Converting data format...')
    train_data_x = []
    train_data_y = []
    test_data_x = []
    test_data_y = []

    # Pad the vector and label files so their line counts are multiples of batch_size.
    AddVectorCountToTimesOfBatch_size(trainVectorFilePath, batch_size, wordDimension)
    AddLabelCountToTimesOfBatch_size(trainLabelFilePath, batch_size)

    train_vec_file = open(trainVectorFilePath, 'r')
    train_label_file = open(trainLabelFilePath, 'r')
    vectorCount = wc(trainVectorFilePath)
    for i in range(0, vectorCount):
        vec_line = train_vec_file.readline().strip()
        label_line = train_label_file.readline().strip()
        train_data_x.append([float(elem) for elem in vec_line.split(' ')])
        if label_line[0] == '1':
            train_data_y.extend([1])
        else:
            train_data_y.extend([0])

    # Note: the test split reuses vectorCount computed from the training file.
    test_vec_file = open(testVectorFilePath, 'r')
    test_label_file = open(testLabelFilePath, 'r')
    for i in range(0, vectorCount):
        vec_line = test_vec_file.readline().strip()
        label_line = test_label_file.readline().strip()
        test_data_x.append([float(elem) for elem in vec_line.split(' ')])
        if label_line[0] == '1':
            test_data_y.extend([1])
        else:
            test_data_y.extend([0])

    output_file = open(dataSetPath, 'wb')
    train_data = [train_data_x, train_data_y]
    test_data = [test_data_x, test_data_y]
    cPickle.dump(train_data, output_file)
    cPickle.dump(test_data, output_file)
    output_file.close()
def test_wc(some_text, expected, tmp_path):
    f = tmp_path / "some_file.txt"
    f.write_bytes(some_text)
    output = wc(f.resolve())
    # replace tabs / multiple spaces by single space
    output = re.sub(r"\t|\s+", " ", output)
    assert expected in output
    # file with/without path allowed
    assert f.name in output
def register_update_closure():
    stream_id = stream_id_build(account, call)
    logging.debug("Registering %s (%d updates) for %s"
                  % ("success" if success else "failure",
                     len(statuses), stream_id))
    try:
        if success:
            wc().stream_updated(
                stream_id,
                new_objects=len(statuses),
                objects_inline=len(statuses))
        else:
            wc().stream_update_failed(
                stream_id,
                reason=woodchuck.TransferStatus.FailureGone)
    except Exception, e:
        logging.exception(
            "Registering update of %s with Woodchuck: %s"
            % (stream_id, str(e)))
        return
def WordCount(SeriPath, type, category, dimension):
    filenames = []
    filenames.append(SeriPath + type + "_test_" + category + "_en.txt")
    filenames.append(SeriPath + type + "_test_" + category + "_cn.txt")
    filenames.append(SeriPath + type + "_train_" + category + "_en.txt")
    filenames.append(SeriPath + type + "_train_" + category + "_cn.txt")

    maxLen = 0
    wordsnumber = wc(SeriPath + type + "_" + category + "_dict_" + str(dimension) + ".txt")
    for filename in filenames:
        tmpLen = maxWordLen(filename)
        if maxLen < tmpLen:
            maxLen = tmpLen
    print(wordsnumber)
    return wordsnumber, maxLen
filename = os.path.join(post_path, str(time.time()) + '-' + str(random.random()))
with open(filename, 'wb') as fhandle:
    post = {
        'shorten_url': shorten_url,
        'serialize': serialize,
        'text': text,
        'latitude': latitude,
        'longitude': longitude,
        'base_url': base_url,
        'action': action,
        'tweet_id': tweet_id,
    }
    pickle.dump(post, fhandle, pickle.HIGHEST_PROTOCOL)

# Register the post with Woodchuck.
if wc().available():
    try:
        if len(text) > 25:
            human_readable_name = text[:23] + "..."
        else:
            human_readable_name = text
        wc()['topost'].object_register(
            object_identifier=os.path.basename(filename),
            human_readable_name=human_readable_name,
            expected_size=-1 * os.path.getsize(filename))
    except Exception:
        logging.exception("Registering post %s with Woodchuck" % (filename,))
def AddLabelCountToTimesOfBatch_size(fileName, batch_size):
    count = wc(fileName)
    countNeedToAdd = (batch_size - (count % batch_size)) % batch_size
    writeNonsenceLabel(fileName, countNeedToAdd)
def AddVectorCountToTimesOfBatch_size(fileName, batch_size, wordDimension):
    count = wc(fileName)
    countNeedToAdd = (batch_size - (count % batch_size)) % batch_size
    writeNonsenceVector(fileName, countNeedToAdd, wordDimension)
def test_with_nonExisting_file(self):
    with self.assertRaises(FileNotFoundError):
        wc("lines", str(uuid4()))  # random, non-existent file name
def test_lines(self):
    self.assertEqual(wc("lines", self.test_file), 2)
def test_words(self):
    self.assertEqual(wc("words", self.test_file), 11)
def register_update(account, call, status_id_prefix, success, statuses):
    # The closure will be run in the main thread asynchronously and after
    # this thread is destroyed.  Do NOT access the self variable.
    def register_update_closure():
        stream_id = stream_id_build(account, call)
        logging.debug("Registering %s (%d updates) for %s"
                      % ("success" if success else "failure",
                         len(statuses), stream_id))
        try:
            if success:
                wc().stream_updated(
                    stream_id,
                    new_objects=len(statuses),
                    objects_inline=len(statuses))
            else:
                wc().stream_update_failed(
                    stream_id,
                    reason=woodchuck.TransferStatus.FailureGone)
        except Exception, e:
            logging.exception(
                "Registering update of %s with Woodchuck: %s"
                % (stream_id, str(e)))

        for status in statuses:
            try:
                poster = status.user.screen_name
            except Exception:
                poster = status.sender_screen_name

            text = ""
            try:
                text = status.text
                if len(text) > 25:
                    text = text[:22] + "..."
            except Exception:
                pass

            human_readable_name = "%s: %s" % (poster, text)

            filename = ""
            size = None
            try:
                filename = status_id_prefix + str(status.id)
                size = os.path.getsize(filename)
            except Exception, e:
                logging.exception("Getting size of %s: %s"
                                  % (filename, str(e)))

            try:
                wc()[stream_id].object_register(
                    object_identifier=str(status.id),
                    human_readable_name=human_readable_name,
                    expected_size=size)
                wc()[stream_id][str(status.id)].transferred(
                    object_size=size)
            except Exception, e:
                logging.exception(
                    "Registering transfer of %s (%s) (in %s): %s"
                    % (human_readable_name, status.id, stream_id, str(e)))
def test_chars(self):
    self.assertEqual(wc("chars", self.test_file), len(self.contents))
def preprocess(seriFilePath, labelFilePath, dataSetPath, wordDimension, batch_size=10, dictCount=1000):
    print('Converting data format...')
    train_data_x = []
    train_data_y = []
    test_data_x = []
    test_data_y = []

    # Pad the serialization and label files so their line counts are multiples of batch_size.
    AddSeriCountToTimesOfBatch_size(seriFilePath, batch_size, wordDimension, dictCount)
    AddLabelCountToTimesOfBatch_size(labelFilePath, batch_size)

    train_vec_file = open(seriFilePath, 'r')
    train_label_file = open(labelFilePath, 'r')
    vectorCount = wc(seriFilePath)
    for i in range(0, vectorCount):
        vec_line = train_vec_file.readline().strip()
        label_line = train_label_file.readline().strip()
        train_data_x.append([float(elem) for elem in vec_line.split(' ')])
        if label_line[0] == '1':
            train_data_y.extend([1])
        else:
            train_data_y.extend([0])

    # Note: the test split is read from the same serialization and label files.
    test_vec_file = open(seriFilePath, 'r')
    test_label_file = open(labelFilePath, 'r')
    for i in range(0, vectorCount):
        vec_line = test_vec_file.readline().strip()
        label_line = test_label_file.readline().strip()
        test_data_x.append([float(elem) for elem in vec_line.split(' ')])
        if label_line[0] == '1':
            test_data_y.extend([1])
        else:
            test_data_y.extend([0])

    output_file = open(dataSetPath, 'wb')
    train_data = [train_data_x, train_data_y]
    test_data = [test_data_x, test_data_y]
    pkl.dump(train_data, output_file)
    pkl.dump(test_data, output_file)
    output_file.close()
                    object_size=size)
            except Exception, e:
                logging.exception(
                    "Registering transfer of %s (%s) (in %s): %s"
                    % (human_readable_name, status.id, stream_id, str(e)))

        logging.debug("Registered update of %s with Woodchuck" % (stream_id,))
        logging.debug("Registered %s (%d updates) for %s"
                      % ("success" if success else "failure",
                         len(statuses), stream_id))

    return register_update_closure

if wc().available():
    logging.debug("Woodchuck available: queued update")
    mainthread.execute(
        register_update(
            self.account, self.call, self.statusIdFilename(""),
            success, statuses),
        async=True)
else:
    logging.debug("Woodchuck NOT available: not registering updates")

settings.sync()
logging.debug('%s finished' % self.call)
from wc import wc

a = wc()
f1 = "/usr/share/dict/words"
a.wc_show("/home/SWT-LABOR/1810226/nethome/PYP/19pyth10/PyPrgs/b05/wc.py")
a.set_lang("en")
a.wc_show(f1)