Example #1
import os

def SingleProcess(corpusRootPath, lstmOutputRootPath, clas, language,
                  wordDimension, corpusType):

    representationDim = 50
    corpusPath = corpusRootPath
    lstmOutputPath = lstmOutputRootPath + corpusType + "/"

    branchPath = str(wordDimension) + "d/" + language + "/" + clas + "/"
    if not os.path.exists(lstmOutputPath + branchPath):
        os.makedirs(lstmOutputPath + branchPath)
    dictPath = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.extract_" + str(
        wordDimension) + ".lstmDict"
    seriFilePath = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.extract_" + str(
        wordDimension) + ".serialization"
    labelFilePath = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.label"
    datasetPath = lstmOutputPath + branchPath + clas + "_dataSet" + str(
        wordDimension) + ".pkl"

    if not os.path.exists(datasetPath):
        preprocess(seriFilePath,
                   labelFilePath,
                   datasetPath,
                   wordDimension,
                   batch_size=10,
                   dictCount=wc(dictPath))
    if not os.path.exists(lstmOutputPath + branchPath + 'test_proj_best.txt'):
        train_lstm(lstmOutPutRootPath=lstmOutputPath + branchPath,
                   dictPath=dictPath,
                   dataSetPath=datasetPath,
                   dim_proj=wordDimension,
                   n_words=wc(dictPath),
                   max_epochs=30,
                   test_size=wc(seriFilePath))
    numberFile = corpusPath + language + "/" + corpusType + "_" + clas + "_new.txt.number"
    fragmentVectorFile = lstmOutputPath + branchPath + "test_proj_best.txt"
    indexFile = lstmOutputPath + branchPath + corpusType + "_" + clas + "_new.txt.index"
    sentenceVectorFile = lstmOutputPath + branchPath + corpusType + "_" + clas + "_new.txt.sent"
    genSentenceVector(numberFile, fragmentVectorFile, indexFile,
                      sentenceVectorFile, representationDim)

    sentFile = sentenceVectorFile
    newSentFile = lstmOutputPath + branchPath + clas + "_" + corpusType + "_embed_" + language.upper() + ".sent"
    newindexFile = lstmOutputPath + branchPath + clas + "_" + corpusType + "_index_" + language.upper() + ".sent"
    AddZerosVectorToSent(indexFile, sentFile, numberFile, newSentFile,
                         newindexFile, representationDim)
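
Example #1 strings the whole pipeline together (preprocess, train_lstm,
genSentenceVector, AddZerosVectorToSent). A hypothetical call, with every
path and argument value invented purely for illustration, might look like:

# All paths and values below are placeholders, not from the original project.
SingleProcess(corpusRootPath="corpus/", lstmOutputRootPath="lstm_out/",
              clas="book", language="en", wordDimension=50,
              corpusType="train")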
Example #2
import sys
import tempfile

import wc

def make(f_in=sys.stdin, f_out=sys.stdout):
	if f_in==sys.stdin:
		f_temp=tempfile.NamedTemporaryFile(mode='w', delete=False)
		f_temp_name=f_temp.name
		for line in f_in:
			f_temp.write(line)
		f_temp.close()
	else:
		f_temp_name=f_in.name
	#pdb.set_trace()
	linecount=wc.wc(open(f_temp_name), 'lines')
	linecount=int(linecount)-1
	# determine number of blank lines
	f_in=open(f_temp_name, 'r')
	for line in f_in:
		if len(line.strip())==0:  # iterated lines keep their '\n', so strip first
			linecount-=1
	f_in.close()
	f_out.write('#2.1')
	f_in=open(f_temp_name, 'r')
	num_cols=0
	for line in f_in:
		cols=line.strip().split('\t')
		if num_cols==0:
			num_cols=len(cols)
			f_out.write('\n' + '\t'.join([str(linecount), str(num_cols-5)]))
			f_out.write('\n' + '\t'.join(cols[-2:] + cols[3:-2]))
		else:
			f_out.write('\n' + '\t'.join(cols[-2:]) + ' |@chr' + cols[0] + ':' + cols[1] + '-' + cols[2] + '|')
			f_out.write('\t'.join([''] + cols[3:-2]))
			lc=len(cols)
			if lc<num_cols:
				#pdb.set_trace()  # debugging breakpoint left disabled
				f_out.write('\t'.join(['']*(num_cols-lc+1)))
Example #3
def test_wc_on_real_py_file(tmp_path):
    f = tmp_path / "driving.py"
    urlretrieve(py_file, f)  # py_file: URL constant defined elsewhere in the test module
    output = wc(f.resolve())
    counts = ' '.join(output.split()[:3])
    # https://twitter.com/pybites/status/1175795375904628736
    expected = "7 29 216"  # not 8!
    assert counts == expected
    assert f.name in output
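
The tests in Examples #3 and #4 (and again in #6) exercise a wc(path) that
mimics the Unix wc utility: its output starts with the line, word and byte
counts and ends with the file name, and the line count follows wc's
newline-counting convention (hence "not 8!" above for a file without a
trailing newline). The implementation is not shown in these snippets; a
minimal sketch consistent with the assertions could be:

from pathlib import Path

def wc(path):
    # Return "lines words bytes name", like the Unix wc tool.
    data = Path(path).read_bytes()
    lines = data.count(b"\n")  # wc counts newline characters, not lines
    words = len(data.split())
    return f"{lines} {words} {len(data)} {Path(path).name}"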
Example #4
def test_wc(some_text, expected, tmp_path):
    f = tmp_path / "some_file.txt"
    f.write_bytes(some_text)
    output = wc(f.resolve())
    # replace tabs / multiple spaces by single space
    counts = ' '.join(output.split()[:3])
    assert counts == expected
    # file with/without path allowed
    assert f.name in output
Example #5
def preprocess(trainVectorFilePath,
               trainLabelFilePath,
               testVectorFilePath,
               testLabelFilePath,
               dataSetPath,
               wordDimension,
               batch_size=200):

    print('Converting data format...')
    train_data_x = []
    train_data_y = []
    test_data_x = []
    test_data_y = []
    # train_vec_file = open("H:/CNN_YANG/"+embedName+"/train_"+str(wordDimension+posDimension)+".txt", 'r')
    # train_label_file = open("H:/CNN_YANG/"+embedName+"/train_label_"+str(wordDimension+posDimension)+".txt", 'r')

    #############################################
    AddVectorCountToTimesOfBatch_size(trainVectorFilePath, batch_size,
                                      wordDimension)
    AddLabelCountToTimesOfBatch_size(trainLabelFilePath, batch_size)
    #############################################
    train_vec_file = open(trainVectorFilePath, 'r')
    train_label_file = open(trainLabelFilePath, 'r')
    # train_vec_file=open("yuliao/book/outEmbedding_Trim.txt", 'r')
    # train_label_file = open("yuliao/book/dvd_label.txt", 'r')
    vectorCount = wc(trainVectorFilePath)

    for i in range(0, vectorCount):
        vec_line = train_vec_file.readline().strip()

        label_line = train_label_file.readline().strip()
        train_data_x.append([float(elem) for elem in vec_line.split(' ')])
        if label_line[0] == '1':
            train_data_y.extend([1])
        else:
            train_data_y.extend([0])

    test_vec_file = open(testVectorFilePath, 'r')
    test_label_file = open(testLabelFilePath, 'r')
    testVectorCount = wc(testVectorFilePath)  # count the test file itself, not the train file

    for i in range(0, testVectorCount):
        vec_line = test_vec_file.readline().strip()
        label_line = test_label_file.readline().strip()
        test_data_x.append([float(elem) for elem in vec_line.split(' ')])
        if label_line[0] == '1':
            test_data_y.extend([1])
        else:
            test_data_y.extend([0])

    output_file = open(dataSetPath, 'wb')

    train_data = [train_data_x, train_data_y]
    test_data = [test_data_x, test_data_y]
    cPickle.dump(train_data, output_file)  # cPickle is Python 2; use pickle on Python 3
    cPickle.dump(test_data, output_file)

    output_file.close()
Example #6
def test_wc(some_text, expected, tmp_path):
    f = tmp_path / "some_file.txt"
    f.write_bytes(some_text)
    output = wc(f.resolve())
    # replace tabs / multiple spaces by single space
    output = re.sub(r"\t|\s+", " ", output)

    assert expected in output
    # file with/without path allowed
    assert f.name in output
Example #7
            def register_update_closure():
                stream_id = stream_id_build(account, call)

                logging.debug("Registering %s (%d updates) for %s"
                              % ("success" if success else "failure",
                                 len(statuses), stream_id))

                try:
                    if success:
                        wc().stream_updated(
                            stream_id,
                            new_objects=len(statuses),
                            objects_inline=len(statuses))
                    else:
                        wc().stream_update_failed(
                            stream_id,
                            reason=woodchuck.TransferStatus.FailureGone)
                except Exception as e:
                    logging.exception(
                        "Registering update of %s with Woodchuck: %s"
                        % (stream_id, str(e)))
                    return
Example #8
def WordCount(SeriPath, type, category, dimension):
    #######################
    filenames = []
    filenames.append(SeriPath + type + "_test_" + category + "_en.txt")
    filenames.append(SeriPath + type + "_test_" + category + "_cn.txt")
    filenames.append(SeriPath + type + "_train_" + category + "_en.txt")
    filenames.append(SeriPath + type + "_train_" + category + "_cn.txt")
    wordsnumber = wc(SeriPath + type + "_" + category + "_dict_" +
                     str(dimension) + ".txt")
    maxLen = 0
    for filename in filenames:
        maxLen = max(maxLen, maxWordLen(filename))
    print(wordsnumber)
    return wordsnumber, maxLen
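
WordCount relies on a maxWordLen helper that is not shown in these snippets.
Judging by its use, it presumably returns the largest number of tokens on any
line of a file; a sketch under that assumption:

def maxWordLen(filename):
    # Longest line measured in whitespace-separated tokens
    # (assumed behavior; the original helper is not shown).
    with open(filename, 'r') as f:
        return max((len(line.split()) for line in f), default=0)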
Example #9
    filename = os.path.join(post_path,
                            str(time.time()) + '-' + str(random.random()))
    with open(filename, 'wb') as fhandle:
        post = {
            'shorten_url': shorten_url,
            'serialize': serialize,
            'text': text,
            'latitude': latitude,
            'longitude': longitude,
            'base_url': base_url,
            'action': action,
            'tweet_id': tweet_id,
        }
        pickle.dump(post, fhandle, pickle.HIGHEST_PROTOCOL)

    # Register the post with Woodchuck.
    if wc().available():
        try:
            if len(text) > 25:
                human_readable_name = text[:23] + "..."
            else:
                human_readable_name = text

            wc()['topost'].object_register(
                object_identifier=os.path.basename(filename),
                human_readable_name=human_readable_name,
                expected_size=-1 * os.path.getsize(filename))
        except Exception:
            logging.exception("Registering post %s with Woodchuck" %
                              (filename, ))
Example #10
def AddLabelCountToTimesOfBatch_size(fileName, batch_size):
    count = wc(fileName)
    countNeedToAdd = (batch_size - (count % batch_size))%batch_size
    writeNonsenceLabel(fileName, countNeedToAdd)
Example #11
def AddVectorCountToTimesOfBatch_size(fileName, batch_size, wordDimension):
    count = wc(fileName)
    countNeedToAdd = (batch_size - (count % batch_size))%batch_size
    writeNonsenceVector(fileName, countNeedToAdd, wordDimension)
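
Examples #10 and #11 pad a file until its line count is a multiple of
batch_size; the expression (batch_size - (count % batch_size)) % batch_size
is the number of filler lines still needed. The writeNonsenceLabel and
writeNonsenceVector helpers they call are not shown; a plausible sketch,
assuming a dummy '0' label and zero vectors in the space-separated format
that preprocess() reads back:

def writeNonsenceLabel(fileName, count):
    # Append `count` dummy label lines (assumed format: a single '0').
    with open(fileName, 'a') as f:
        for _ in range(count):
            f.write('0\n')

def writeNonsenceVector(fileName, count, wordDimension):
    # Append `count` zero vectors of length wordDimension.
    zeros = ' '.join(['0.0'] * wordDimension)
    with open(fileName, 'a') as f:
        for _ in range(count):
            f.write(zeros + '\n')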
Example #12
def test_with_nonExisting_file(self):
    with self.assertRaises(FileNotFoundError):
        wc("lines", str(uuid4()))
Example #13
def test_lines(self):
    self.assertEqual(wc("lines", self.test_file), 2)
Example #14
def test_words(self):
    self.assertEqual(wc("words", self.test_file), 11)
Example #15
        def register_update(account, call, status_id_prefix, success, statuses):
            # The closure will be run in the main thread
            # asynchronously and after this thread is destroyed. Do
            # NOT access the self variable.
            def register_update_closure():
                stream_id = stream_id_build(account, call)

                logging.debug("Registering %s (%d updates) for %s"
                              % ("success" if success else "failure",
                                 len(statuses), stream_id))

                try:
                    if success:
                        wc().stream_updated(
                            stream_id,
                            new_objects=len(statuses),
                            objects_inline=len(statuses))
                    else:
                        wc().stream_update_failed(
                            stream_id,
                            reason=woodchuck.TransferStatus.FailureGone)
                except Exception as e:
                    logging.exception(
                        "Registering update of %s with Woodchuck: %s"
                        % (stream_id, str(e)))
                    return

                for status in statuses:
                    try:
                        poster = status.user.screen_name
                    except Exception:
                        poster = status.sender_screen_name

                    text = ""
                    try:
                        text = status.text
                        if len(text) > 25:
                            text = text[:22] + "..."
                    except Exception:
                        pass

                    human_readable_name = "%s: %s" % (poster, text)

                    filename = ""
                    size = None
                    try:
                        filename = status_id_prefix + str(status.id)
                        size = os.path.getsize(filename)
                    except Exception as e:
                        logging.exception("Getting size of %s: %s"
                                          % (filename, str(e)))

                    try:
                        wc()[stream_id].object_register(
                            object_identifier=str(status.id),
                            human_readable_name=human_readable_name,
                            expected_size=size)
                        wc()[stream_id][str(status.id)].transferred(
                            object_size=size)
                    except Exception as e:
                        logging.exception(
                            "Registering transfer of %s (%s) (in %s): %s"
                            % (human_readable_name, status.id,
                               stream_id, str(e)))
Example #16
def test_with_nonExisting_file(self):
    with self.assertRaises(FileNotFoundError):
        wc("lines", str(uuid4()))
Example #17
def test_lines(self):
    self.assertEqual(wc("lines", self.test_file), 2)
Example #18
def test_words(self):
    self.assertEqual(wc("words", self.test_file), 11)
Example #19
def test_chars(self):
    self.assertEqual(wc("chars", self.test_file),
                     len(self.contents))
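
Examples #12 through #19 test a different wc: it takes a count mode and a
file path, returns an integer, and lets FileNotFoundError propagate for a
missing file. A minimal sketch matching those tests (not the original code):

def wc(mode, path):
    # Count lines, words or characters of a text file; opening a missing
    # path raises FileNotFoundError, as test_with_nonExisting_file expects.
    with open(path, 'r') as f:
        text = f.read()
    if mode == 'lines':
        return len(text.splitlines())
    if mode == 'words':
        return len(text.split())
    if mode == 'chars':
        return len(text)
    raise ValueError('unknown mode: %r' % mode)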
Example #20
    filename = os.path.join(
        post_path, str(time.time()) + '-' + str(random.random()))
    with open(filename, 'wb') as fhandle:
        post = {
            'shorten_url': shorten_url,
            'serialize': serialize,
            'text': text,
            'latitude': latitude,
            'longitude': longitude,
            'base_url': base_url,
            'action': action,
            'tweet_id': tweet_id,
        }
        pickle.dump(post, fhandle, pickle.HIGHEST_PROTOCOL)

    # Register the post with Woodchuck.
    if wc().available():
        try:
            if len(text) > 25:
                human_readable_name = text[:23] + "..."
            else:
                human_readable_name = text

            wc()['topost'].object_register(
                object_identifier=os.path.basename(filename),
                human_readable_name=human_readable_name,
                expected_size=-1 * os.path.getsize(filename))
        except Exception:
            logging.exception("Registering post %s with Woodchuck"
                              % (filename,))
Example #21
def preprocess(seriFilePath,
               labelFilePath,
               dataSetPath,
               wordDimension,
               batch_size=10,
               dictCount=1000):
    print('Converting data format...')
    train_data_x = []
    train_data_y = []
    test_data_x = []
    test_data_y = []
    # train_vec_file = open("H:/CNN_YANG/"+embedName+"/train_"+str(wordDimension+posDimension)+".txt", 'r')
    # train_label_file = open("H:/CNN_YANG/"+embedName+"/train_label_"+str(wordDimension+posDimension)+".txt", 'r')

    #############################################
    AddSeriCountToTimesOfBatch_size(seriFilePath, batch_size, wordDimension,
                                    dictCount)
    AddLabelCountToTimesOfBatch_size(labelFilePath, batch_size)
    #############################################
    train_vec_file = open(seriFilePath, 'r')
    train_label_file = open(labelFilePath, 'r')
    # train_vec_file=open("yuliao/book/outEmbedding_Trim.txt", 'r')
    # train_label_file = open("yuliao/book/dvd_label.txt", 'r')
    vectorCount = wc(seriFilePath)

    for i in range(0, vectorCount):
        vec_line = train_vec_file.readline().strip()

        label_line = train_label_file.readline().strip()
        train_data_x.append([float(elem) for elem in vec_line.split(' ')])
        if label_line[0] == '1':
            train_data_y.extend([1])
        else:
            train_data_y.extend([0])

    # Note: this variant re-reads the same vector and label files for the
    # test split, so the train and test sets are identical here.
    test_vec_file = open(seriFilePath, 'r')
    test_label_file = open(labelFilePath, 'r')

    for i in range(0, vectorCount):
        vec_line = test_vec_file.readline().strip()
        label_line = test_label_file.readline().strip()
        test_data_x.append([float(elem) for elem in vec_line.split(' ')])
        if label_line[0] == '1':
            test_data_y.extend([1])
        else:
            test_data_y.extend([0])

    output_file = open(dataSetPath, 'wb')

    train_data = [train_data_x, train_data_y]
    test_data = [test_data_x, test_data_y]
    pkl.dump(train_data, output_file)
    pkl.dump(test_data, output_file)

    output_file.close()
Example #22
                    try:
                        wc()[stream_id][str(status.id)].transferred(
                            object_size=size)
                    except Exception as e:
                        logging.exception(
                            "Registering transfer of %s (%s) (in %s): %s"
                            % (human_readable_name, status.id,
                               stream_id, str(e)))

                    logging.debug("Registered update of %s with Woodchuck"
                                  % (stream_id,))

                logging.debug("Registered %s (%d updates) for %s"
                              % ("success" if success else "failure",
                                 len(statuses), stream_id))

            return register_update_closure

        if wc().available():
            logging.debug("Wookchuck available: queued update")
            mainthread.execute(
                register_update(
                    self.account, self.call, self.statusIdFilename(""),
                    success, statuses),
                async=True)
        else:
            logging.debug("Wookchuck NOT available: not registering updates")

        settings.sync()
        logging.debug('%s finished' % self.call)


Example #23
from wc import wc

a = wc()
f1 = "/usr/share/dict/words"
a.wc_show("/home/SWT-LABOR/1810226/nethome/PYP/19pyth10/PyPrgs/b05/wc.py")
a.set_lang("en")
a.wc_show(f1)
Example #24
def test_chars(self):
    self.assertEqual(wc("chars", self.test_file), len(self.contents))