def batch_loader(finput, fref, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield batches of aligned input / reference / score samples.

    ``finput`` and ``fref`` are consumed with ``list_reader`` and ``ftarget``
    with ``line_reader``; the three streams are zipped, so iteration stops at
    the shortest one. Every target entry is converted with ``float``.

    The sample that opens a batch fixes the length limit
    ``lgth + min(maxpad, lgth // maxpart + 1)`` (``lgth`` is the summed length
    of the input and reference entries) and the size limit returned by
    ``get_bsize(limit, maxtoken, bsize)``. A sample joins the current batch
    while the batch holds fewer than ``minbsize`` samples, or while both
    limits are respected; otherwise the batch is emitted and the sample opens
    the next one.

    Yields:
        ``(inputs, refs, scores, max_input_len, max_ref_len)`` tuples.
    """
    inputs, refs, scores = [], [], []
    held = len_limit = longest_in = longest_ref = 0
    size_limit = bsize
    streams = zip(list_reader(finput), list_reader(fref), line_reader(ftarget))
    for src, ref, raw_score in streams:
        src_len, ref_len = len(src), len(ref)
        total = src_len + ref_len
        if len_limit == 0:
            # The limit is 0 until a sample has set it.
            len_limit = total + min(maxpad, total // maxpart + 1)
            size_limit = get_bsize(len_limit, maxtoken, bsize)
        if held >= minbsize and (total > len_limit or held >= size_limit):
            # The sample does not fit: flush, then let it open a new batch.
            yield inputs, refs, scores, longest_in, longest_ref
            inputs, refs, scores = [src], [ref], [float(raw_score)]
            longest_in, longest_ref = src_len, ref_len
            len_limit = total + min(maxpad, total // maxpart + 1)
            size_limit = get_bsize(len_limit, maxtoken, bsize)
            held = 1
            continue
        inputs.append(src)
        refs.append(ref)
        scores.append(float(raw_score))
        longest_in = max(longest_in, src_len)
        longest_ref = max(longest_ref, ref_len)
        held += 1
    if inputs:
        yield inputs, refs, scores, longest_in, longest_ref
def batch_loader(finput, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield batches of samples whose lengths stay inside a symmetric window.

    Samples come from ``list_reader(finput)``. The sample that opens a batch
    defines a slack ``max(1, min(maxpad, lgth // maxpart + 1) // 2)``; the
    batch then accepts lengths in ``[lgth - slack, lgth + slack]`` up to the
    size returned by ``get_bsize(lgth + slack, maxtoken, bsize)``. While the
    batch holds fewer than ``minbsize`` samples, samples are accepted
    regardless of these limits.

    Yields:
        ``(samples, max_len)`` tuples.
    """

    def _window(length):
        # Lower bound, upper bound and size limit for a batch opened by a
        # sample of the given length.
        slack = max(1, min(maxpad, length // maxpart + 1) // 2)
        upper = length + slack
        lower = length - slack
        return lower, upper, get_bsize(upper, maxtoken, bsize)

    samples = []
    held = upper = lower = longest = 0
    size_limit = bsize
    for sample in list_reader(finput):
        length = len(sample)
        if upper == 0:
            lower, upper, size_limit = _window(length)
        in_window = lower <= length <= upper
        if held >= minbsize and not (in_window and held < size_limit):
            yield samples, longest
            samples = [sample]
            longest = length
            lower, upper, size_limit = _window(length)
            held = 1
        else:
            samples.append(sample)
            longest = max(longest, length)
            held += 1
    if samples:
        yield samples, longest
def batch_loader(finput, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield batches of aligned input / target samples.

    Both files are consumed with ``list_reader`` and zipped, so iteration
    stops at the shorter stream. ``lgth`` is the summed length of an input
    entry and its target entry. The sample that opens a batch sets the length
    limit ``lgth + min(maxpad, lgth // maxpart + 1)`` and the size limit
    ``get_bsize(limit, maxtoken, bsize)``. A sample is added while the batch
    holds fewer than ``minbsize`` samples or while both limits hold;
    otherwise the batch is emitted and the sample opens the next one.

    Yields:
        ``(inputs, targets, max_input_len, max_target_len)`` tuples.
    """
    inputs, targets = [], []
    held = len_limit = longest_in = longest_tgt = 0
    for src, tgt in zip(list_reader(finput), list_reader(ftarget)):
        src_len = len(src)
        tgt_len = len(tgt)
        total = src_len + tgt_len
        if len_limit == 0:
            # Always taken for the very first sample, which also defines
            # size_limit before it is first read.
            len_limit = total + min(maxpad, total // maxpart + 1)
            size_limit = get_bsize(len_limit, maxtoken, bsize)
        accept = held < minbsize or (total <= len_limit and held < size_limit)
        if not accept:
            yield inputs, targets, longest_in, longest_tgt
            inputs, targets = [src], [tgt]
            longest_in, longest_tgt = src_len, tgt_len
            len_limit = total + min(maxpad, total // maxpart + 1)
            size_limit = get_bsize(len_limit, maxtoken, bsize)
            held = 1
            continue
        inputs.append(src)
        targets.append(tgt)
        if longest_in < src_len:
            longest_in = src_len
        if longest_tgt < tgt_len:
            longest_tgt = tgt_len
        held += 1
    if inputs:
        yield inputs, targets, longest_in, longest_tgt
def batch_loader(finput, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield batches of aligned documents that share one sentence count.

    Both files are read with ``doc_reader``, which yields ``(doc, length)``
    pairs; the two streams are zipped. ``len(doc)`` of the input side is used
    as the sentence count, and ``lgth`` is the sum of the two reported
    lengths. The document that opens a batch fixes:

    * the slack ``max(1, min(maxpad, lgth // maxpart + 1) // 2)`` and the
      accepted length window ``[lgth - slack, lgth + slack]``;
    * the size limit ``max(1, get_bsize(lgth + slack, maxtoken, bsize) //
      sentence_count)``;
    * the sentence count every later document of the batch must match.

    A document with a matching sentence count is added while the batch holds
    fewer than ``minbsize`` documents or while the window and size limit
    hold; otherwise the batch is emitted and the document opens the next one.

    Yields:
        ``(inputs, targets, max_input_len, max_target_len, sentence_count)``.
    """

    def _limits(length, sentences):
        # Window bounds and per-batch document limit for an opening document.
        slack = max(1, min(maxpad, length // maxpart + 1) // 2)
        upper = length + slack
        lower = length - slack
        limit = max(1, get_bsize(upper, maxtoken, bsize) // sentences)
        return lower, upper, limit

    inputs, targets = [], []
    held = upper = lower = longest_in = longest_tgt = batch_sents = 0
    size_limit = bsize
    pairs = zip(doc_reader(finput), doc_reader(ftarget))
    for (src_doc, src_len), (tgt_doc, tgt_len) in pairs:
        sents = len(src_doc)
        total = src_len + tgt_len
        if upper == 0:
            lower, upper, size_limit = _limits(total, sents)
        if batch_sents == 0:
            batch_sents = sents
        fits = sents == batch_sents and (
            held < minbsize
            or (lower <= total <= upper and held < size_limit)
        )
        if fits:
            inputs.append(src_doc)
            targets.append(tgt_doc)
            longest_in = max(longest_in, src_len)
            longest_tgt = max(longest_tgt, tgt_len)
            held += 1
            continue
        yield inputs, targets, longest_in, longest_tgt, batch_sents
        inputs, targets = [src_doc], [tgt_doc]
        longest_in, longest_tgt = src_len, tgt_len
        batch_sents = sents
        lower, upper, size_limit = _limits(total, sents)
        held = 1
    if inputs:
        yield inputs, targets, longest_in, longest_tgt, batch_sents
def batch_loader(finput, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield single-task batches of aligned input / target samples.

    Both files are read with ``list_reader`` and zipped, so iteration stops
    at the shorter stream. The first element of every input entry is taken as
    the task identifier and stripped from the data that is batched. ``lgth``
    is the length of the remaining input plus the length of the target.

    The sample that opens a batch sets the length limit
    ``lgth + min(maxpad, ceil(lgth / maxpart))`` and the size limit
    ``get_bsize(limit, maxtoken, bsize)``. A sample of the batch's task is
    added while the batch holds fewer than ``minbsize`` samples or while both
    limits hold; otherwise (including on a task change) the batch is emitted
    and the sample opens the next one.

    Yields:
        ``(inputs, targets, task, max_input_len, max_target_len)`` tuples.
    """
    _f_maxpart = float(maxpart)
    rsi = []
    rst = []
    rstask = None
    nd = maxlen = mlen_i = mlen_t = 0
    for i_d, td in zip(list_reader(finput), list_reader(ftarget)):
        lid = len(i_d) - 1
        ltd = len(td)
        lgth = lid + ltd
        _task = i_d[0]
        # To filter out empty data (e.g. in OPUS-100), uncomment the
        # following 2 lines.
        #if (lid <= 0) or (ltd <= 0):
            #continue
        if nd == 0:
            # Adopt the task only when the batch is empty. ``maxlen`` is 0
            # again whenever a batch was opened by a sample of total length
            # 0, so tying this assignment to ``maxlen == 0`` would overwrite
            # the task of a non-empty batch and let a sample of another task
            # slip into it.
            rstask = _task
        if maxlen == 0:
            maxlen = lgth + min(maxpad, ceil(lgth / _f_maxpart))
            _bsize = get_bsize(maxlen, maxtoken, bsize)
        if (rstask == _task) and ((nd < minbsize) or (lgth <= maxlen and nd < _bsize)):
            rsi.append(i_d[1:])
            rst.append(td)
            if lid > mlen_i:
                mlen_i = lid
            if ltd > mlen_t:
                mlen_t = ltd
            nd += 1
        else:
            yield rsi, rst, rstask, mlen_i, mlen_t
            rsi = [i_d[1:]]
            rstask = _task
            rst = [td]
            mlen_i = lid
            mlen_t = ltd
            maxlen = lgth + min(maxpad, ceil(lgth / _f_maxpart))
            _bsize = get_bsize(maxlen, maxtoken, bsize)
            nd = 1
    if rsi:
        yield rsi, rst, rstask, mlen_i, mlen_t
def batch_loader(finput, fmt, ftarget, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield batches of aligned samples drawn from three list streams.

    ``finput``, ``fmt`` and ``ftarget`` are each read with ``list_reader``
    and zipped, so iteration stops at the shortest stream. ``lgth`` is the
    summed length of the three entries of a sample. The sample that opens a
    batch sets the length limit ``lgth + min(maxpad, ceil(lgth / maxpart))``
    and the size limit ``get_bsize(limit, maxtoken, bsize)``. A sample is
    added while the batch holds fewer than ``minbsize`` samples or while both
    limits hold; otherwise the batch is emitted and the sample opens the next
    one.

    Yields:
        ``(inputs, mids, targets, max_input_len, max_mid_len,
        max_target_len)`` tuples, where ``mids`` holds the ``fmt`` entries.
    """
    part = float(maxpart)
    inputs, mids, targets = [], [], []
    held = len_limit = longest_in = longest_mid = longest_tgt = 0
    triples = zip(list_reader(finput), list_reader(fmt), list_reader(ftarget))
    for src, mid, tgt in triples:
        src_len, mid_len, tgt_len = len(src), len(mid), len(tgt)
        total = src_len + mid_len + tgt_len
        if len_limit == 0:
            # The limit is 0 until a sample has set it; the first sample
            # therefore always defines size_limit before it is read.
            len_limit = total + min(maxpad, ceil(total / part))
            size_limit = get_bsize(len_limit, maxtoken, bsize)
        if held >= minbsize and (total > len_limit or held >= size_limit):
            yield inputs, mids, targets, longest_in, longest_mid, longest_tgt
            inputs, mids, targets = [src], [mid], [tgt]
            longest_in, longest_mid, longest_tgt = src_len, mid_len, tgt_len
            len_limit = total + min(maxpad, ceil(total / part))
            size_limit = get_bsize(len_limit, maxtoken, bsize)
            held = 1
            continue
        inputs.append(src)
        mids.append(mid)
        targets.append(tgt)
        longest_in = max(longest_in, src_len)
        longest_mid = max(longest_mid, mid_len)
        longest_tgt = max(longest_tgt, tgt_len)
        held += 1
    if inputs:
        yield inputs, mids, targets, longest_in, longest_mid, longest_tgt
def batch_loader(finput, bsize, maxpad, maxpart, maxtoken, minbsize):
    """Yield single-task batches of samples inside a symmetric length window.

    Samples come from ``list_reader(finput)``. The first element of each
    entry is taken as the task identifier and stripped from the batched data;
    ``lgth`` is the length of what remains. The sample that opens a batch
    defines the slack ``max(1, min(maxpad, ceil(lgth / maxpart)) // 2)``, the
    accepted window ``[lgth - slack, lgth + slack]`` and the size limit
    ``get_bsize(lgth + slack, maxtoken, bsize)``. A sample of the batch's
    task is added while the batch holds fewer than ``minbsize`` samples or
    while the window and size limit hold; otherwise (including on a task
    change) the batch is emitted and the sample opens the next one.

    Yields:
        ``(samples, task, max_len)`` tuples.
    """
    part = float(maxpart)

    def _window(length):
        # Lower bound, upper bound and size limit for an opening sample.
        slack = max(1, min(maxpad, ceil(length / part)) // 2)
        upper = length + slack
        lower = length - slack
        return lower, upper, get_bsize(upper, maxtoken, bsize)

    samples = []
    batch_task = None
    held = upper = lower = longest = 0
    for entry in list_reader(finput):
        length = len(entry) - 1
        task = entry[0]
        # To filter out empty data, uncomment the following 2 lines.
        #if length <= 0:
            #continue
        if upper == 0:
            lower, upper, size_limit = _window(length)
            batch_task = task
        fits = batch_task == task and (
            held < minbsize
            or (lower <= length <= upper and held < size_limit)
        )
        if fits:
            samples.append(entry[1:])
            longest = max(longest, length)
            held += 1
            continue
        yield samples, batch_task, longest
        samples = [entry[1:]]
        batch_task = task
        longest = length
        lower, upper, size_limit = _window(length)
        held = 1
    if samples:
        yield samples, batch_task, longest