def infer_portioning(disc_dir):
    """Infer the portioning of a game given the glozz files in `disc_dir`.

    Each line of each glozz .ac file found in `disc_dir` contributes one
    identifier: the text that precedes its first ':' is stripped and handed
    to `parse_turn_id`.
    NOTE(review): this gives exactly one identifier per portion only if
    every .ac file consists of a single line -- confirm against the corpus.

    Parameters
    ----------
    disc_dir : string
        Path to a discourse folder containing one glozz .ac file per
        portion of the game ; `unannotated` seems the most convenient
        choice.

    Returns
    -------
    first_idx : list of turn identifiers (as returned by `parse_turn_id`)
        Identifier of the first turn of each portion, sorted in ascending
        order.

    Raises
    ------
    ValueError
        If no .ac file can be found in `disc_dir`.
    """
    ac_paths = glob(os.path.join(disc_dir, '*.ac'))
    if not ac_paths:
        err_msg = 'Unable to locate any glozz .ac file in {}'.format(disc_dir)
        raise ValueError(err_msg)

    first_idx = []
    for ac_path in ac_paths:
        # the path and the open file get distinct names, so that the loop
        # variable is not rebound to a (closed) file object
        with open(ac_path, 'rb') as ac_f:
            for line in ac_f:
                turn_field = line.split(':', 1)[0].strip()
                first_idx.append(parse_turn_id(turn_field))
    return sorted(first_idx)
def backport_portioning(seg_file, first_idx):
    """Encode game portioning in `seg_file`, according to `first_idx`.

    The result is written to `seg_file + '.fut'`, which then replaces
    `seg_file`.  The header line and the already present empty lines are
    copied as they are; an empty line is inserted right before each turn
    whose identifier opens a portion, except for the first portion.

    Parameters
    ----------
    seg_file : string
        Path to the segmented version of the CSV file for the game.

    first_idx : list of int
        Identifier of the first turn of each portion.
    """
    seg_file_res = seg_file + '.fut'
    # turns that need a separator before them: every portion start but the
    # first one ; computed once instead of once per row
    later_starts = first_idx[1:]

    with open(seg_file, 'rb') as seg_f:
        with open(seg_file_res, 'w') as seg_f_res:
            reader = csv.reader(seg_f, delimiter='\t')
            writer = csv.writer(seg_f_res, delimiter='\t',
                                lineterminator='\n')
            # leave header line ; an empty file has none and stays empty
            header = next(reader, None)
            if header is not None:
                writer.writerow(header)
            # regular lines
            for line in reader:
                # keep existing empty lines
                if not line or not ''.join(line).strip():
                    writer.writerow(line)
                    continue
                # insert an empty line just before a starting turn
                # (except for the turn starting the first portion)
                if parse_turn_id(line[0]) in later_starts:
                    writer.writerow([])
                # write the normal line
                writer.writerow(line)
    # replace the original segmented file
    os.rename(seg_file_res, seg_file)
def read_portioning(seg_file):
    """Read portioning in the segmented csv file.

    The first line of the file is skipped as a header.  The first non-empty
    row after the header, and the first non-empty row after each run of
    empty (or whitespace-only) rows, is taken as the start of a portion.

    Parameters
    ----------
    seg_file : string
        Path to the tab-separated segmented file to read.

    Returns
    -------
    first_idx : list of turn identifiers (as returned by `parse_turn_id`)
        Identifier of the first turn of each portion, in file order ; empty
        if the file has no turn.
    """
    first_idx = []
    with open(seg_file, 'rb') as seg_f:
        reader = csv.reader(seg_f, delimiter='\t')
        # leave header line ; an empty file just yields no portion
        next(reader, None)
        # the first turn opens the first portion
        grab_next = True
        # regular lines
        for line in reader:
            # if empty line, be ready to start a new portion
            if not line or not ''.join(line).strip():
                grab_next = True
                continue
            # new portion
            if grab_next:
                first_idx.append(parse_turn_id(line[0]))
                grab_next = False
    return first_idx
def _transfer_turns(f_orig, f_dest, f_res, verbose=0): """Transfer turns from `f_orig` to `f_dest` to produce `f_res`. Parameters ---------- f_orig : File Version of the file with higher priority (typically, the currently annotated version). f_dest : File Version of the file with lower priority (typically, the unannotated but finer-grained version). f_res : File `f_dest` with turns transfered from `f_orig`. verbose : int Verbosity level. """ reader_orig = csv.reader(f_orig, delimiter='\t') reader_dest = csv.reader(f_dest, delimiter='\t') writer_res = csv.writer(f_res, delimiter='\t', lineterminator='\n') # read and write header header_orig = reader_orig.next() header_dest = reader_dest.next() writer_res.writerow(header_dest) # read and write content for line_dest in reader_dest: # _orig exhausted means new turns in _dest try: line_orig = reader_orig.next() except StopIteration: writer_res.writerow(line_dest) continue # easy case: keep lines that are identical on both sides if line_orig == line_dest: writer_res.writerow(line_dest) continue # otherwise: # * empty lines mark subdoc split: # we need to be careful to avoid splitting inside chunks of # game messages buff_orig = [] while (not line_orig or not ''.join(line_orig).strip()): # TODO? get rid of spurious empty lines # look ahead for the next turn in _orig buff_orig.append(line_orig) line_orig = reader_orig.next() # adjust subdoc split: append extra turns from _dest until # either we reach a safe split point # or all extra turns have been consumed if buff_orig: try: # why try/catch: cf. DEBUG below turn_id_orig = parse_turn_id(line_orig[0]) except ValueError: print([i for i, c in enumerate(line_orig[0]) if c == '\t']) print('\n'.join(line_orig)) print(line_orig[0].split('\t')) raise # new turns in _dest should be appended to the current subdoc, # until we reach "It's X's turn to roll the dice." 
(or none # remains) try: turn_id_dest = parse_turn_id(line_dest[0]) except ValueError: print([i for i, c in enumerate(line_dest[0]) if c == '\t']) print('\n'.join(line_dest)) print(line_dest[0].split('\t')) raise while (turn_id_dest < turn_id_orig and not ( line_dest[2] == 'Server' and line_dest[5].endswith('turn to roll the dice.'))): writer_res.writerow(line_dest) # read next turn from _dest line_dest = reader_dest.next() try: turn_id_dest = parse_turn_id(line_dest[0]) except ValueError: print([i for i, c in enumerate(line_dest[0]) if c == '\t']) print('\n'.join(line_dest)) print(line_dest[0].split('\t')) raise # finally, write the (buffered) subdoc split for buff_line in buff_orig: writer_res.writerow(buff_line) # write extra turns from _dest # DEBUG # FIXME csv reader (doublequote=True) fails to split fields on # if one field contains a doubled double quote (supposedly read # as one double-quote) try: parse_turn_id(line_orig[0]) except ValueError: print('line_orig[0]', line_orig[0]) print('Positions of \\t in line_orig[0]', [i for i, c in enumerate(line_orig[0]) if c == '\t']) print('\n'.join(line_orig)) print(line_orig[0].split('\t')) raise # end DEBUG if parse_turn_id(line_orig[0]) < parse_turn_id(line_dest[0]): err_msg = 'Weird state that should never be reached: {}\t{}' raise ValueError(err_msg.format(line_orig, line_dest)) # new turns in _dest: write as they are while parse_turn_id(line_orig[0]) > parse_turn_id(line_dest[0]): writer_res.writerow(line_dest) line_dest = reader_dest.next() if parse_turn_id(line_orig[0]) != parse_turn_id(line_dest[0]): err_msg = 'Weird state that should never be reached: {}\t{}' raise ValueError(err_msg.format(line_orig, line_dest)) # matching turns: transfer line from _orig, with manually segmented # text # optional warnings for differing texts if verbose: txt_orig = line_orig[5] txt_dest = line_dest[5] raw_text_orig = ''.join(txt_orig.split('&')) raw_text_dest = ''.join(txt_dest.split('&')) if raw_text_orig != raw_text_dest: 
err_msg = [ "W: texts differ at turn {}".format(line_orig[0]), "< " + txt_orig, "> " + txt_dest, ] print('\n'.join(err_msg), file=sys.stderr) # finally, write the line writer_res.writerow(line_orig)
def _backport_turn_text(f_orig, f_dest, f_res, verbose=0): """Backport turn text from `f_orig` to `f_dest` to produce `f_res`. Segment delimiters are stripped from the turn text in `f_orig`. Parameters ---------- f_orig : File Version of the file with higher priority. f_dest : File Version of the file with lower priority. f_res : File `f_dest` with turns transfered from `f_orig`. verbose : int Verbosity level. """ reader_orig = csv.reader(f_orig, delimiter='\t') reader_dest = csv.reader(f_dest, delimiter='\t') writer_res = csv.writer(f_res, delimiter='\t', lineterminator='\n') # read and write header header_orig = reader_orig.next() header_dest = reader_dest.next() writer_res.writerow(header_dest) # read and write content for line_dest in reader_dest: # TODO? handle exhaustion of f_orig (try...except?) line_orig = reader_orig.next() # easy case: keep lines that are identical on both sides if line_orig == line_dest: writer_res.writerow(line_dest) continue # skip additional empty lines from f_orig while (not line_orig or not ''.join(line_orig).strip()): line_orig = reader_orig.next() if parse_turn_id(line_orig[0]) != parse_turn_id(line_dest[0]): err_msg = 'Weird state that should never be reached: {}\t{}' raise ValueError(err_msg.format(line_orig, line_dest)) # matching turns: transfer line from _orig, without '&' txt_orig = line_orig[5] # optional warnings for differing texts if verbose: txt_dest = line_dest[5] raw_text_orig = ''.join(txt_orig.split('&')) raw_text_dest = ''.join(txt_dest.split('&')) if raw_text_orig != raw_text_dest: err_msg = [ "W: texts differ at turn {}".format(line_orig[0]), "< " + txt_orig, "> " + txt_dest, ] print('\n'.join(err_msg), file=sys.stderr) # finally, write the line without '&' new_line_dest = line_orig new_line_dest[5] = txt_orig.replace('&', '') writer_res.writerow(new_line_dest)
def _transfer_turns(f_orig, f_dest, f_res, verbose=0): """Transfer turns from `f_orig` to `f_dest` to produce `f_res`. Parameters ---------- f_orig : File Version of the file with higher priority. f_dest : File Version of the file with lower priority. f_res : File `f_dest` with turns transfered from `f_orig`. verbose : int Verbosity level. """ reader_orig = csv.reader(f_orig, delimiter='\t') reader_dest = csv.reader(f_dest, delimiter='\t') writer_res = csv.writer(f_res, delimiter='\t') # read and write header header_orig = reader_orig.next() header_dest = reader_dest.next() writer_res.writerow(header_dest) # read and write content for line_dest in reader_dest: # _orig exhausted means new turns in _dest try: line_orig = reader_orig.next() except StopIteration: writer_res.writerow(line_dest) continue # easy case: keep lines that are identical on both sides if line_orig == line_dest: writer_res.writerow(line_dest) continue # transfer empty lines, they mark subdoc split # TODO? get rid of spurious empty lines while (not line_orig or not ''.join(line_orig).strip()): writer_res.writerow(line_orig) # transfer split line_orig = reader_orig.next() # DEBUG # FIXME csv reader (doublequote=True) fails to split fields on # if one field contains a doubled double quote (supposedly read # as one double-quote) try: parse_turn_id(line_orig[0]) except ValueError: print([i for i, c in enumerate(line_orig[0]) if c == '\t']) print('\n'.join(line_orig)) print(line_orig[0].split('\t')) raise # end DEBUG if parse_turn_id(line_orig[0]) < parse_turn_id(line_dest[0]): err_msg = 'Weird state that should never be reached: {}\t{}' raise ValueError(err_msg.format(line_orig, line_dest)) # new turns in _dest: write as they are while parse_turn_id(line_orig[0]) > parse_turn_id(line_dest[0]): writer_res.writerow(line_dest) line_dest = reader_dest.next() if parse_turn_id(line_orig[0]) != parse_turn_id(line_dest[0]): err_msg = 'Weird state that should never be reached: {}\t{}' raise 
ValueError(err_msg.format(line_orig, line_dest)) # matching turns: transfer line from _orig, with manually segmented # text # optional warnings for differing texts if verbose: txt_orig = line_orig[5] txt_dest = line_dest[5] raw_text_orig = ''.join(txt_orig.split('&')) raw_text_dest = ''.join(txt_dest.split('&')) if raw_text_orig != raw_text_dest: err_msg = [ "W: texts differ at turn {}".format(line_orig[0]), "< " + txt_orig, "> " + txt_dest, ] print('\n'.join(err_msg), file=sys.stderr) # finally, write the line writer_res.writerow(line_orig)
def _transfer_turns(f_orig, f_dest, f_res, verbose=0): """Transfer turns from `f_orig` to `f_dest` to produce `f_res`. Parameters ---------- f_orig : File Version of the file with higher priority (typically, the currently annotated version). f_dest : File Version of the file with lower priority (typically, the unannotated but finer-grained version). f_res : File `f_dest` with turns transfered from `f_orig`. verbose : int Verbosity level. """ reader_orig = csv.reader(f_orig, delimiter='\t') reader_dest = csv.reader(f_dest, delimiter='\t') writer_res = csv.writer(f_res, delimiter='\t', lineterminator='\n') # read and write header header_orig = reader_orig.next() header_dest = reader_dest.next() writer_res.writerow(header_dest) # read and write content for line_dest in reader_dest: # _orig exhausted means new turns in _dest try: line_orig = reader_orig.next() except StopIteration: writer_res.writerow(line_dest) continue # easy case: keep lines that are identical on both sides if line_orig == line_dest: writer_res.writerow(line_dest) continue # otherwise: # * empty lines mark subdoc split: # we need to be careful to avoid splitting inside chunks of # game messages buff_orig = [] while (not line_orig or not ''.join(line_orig).strip()): # TODO? get rid of spurious empty lines # look ahead for the next turn in _orig buff_orig.append(line_orig) line_orig = reader_orig.next() # adjust subdoc split: append extra turns from _dest until # either we reach a safe split point # or all extra turns have been consumed if buff_orig: try: # why try/catch: cf. DEBUG below turn_id_orig = parse_turn_id(line_orig[0]) except ValueError: print([i for i, c in enumerate(line_orig[0]) if c == '\t']) print('\n'.join(line_orig)) print(line_orig[0].split('\t')) raise # new turns in _dest should be appended to the current subdoc, # until we reach "It's X's turn to roll the dice." 
(or none # remains) try: turn_id_dest = parse_turn_id(line_dest[0]) except ValueError: print([i for i, c in enumerate(line_dest[0]) if c == '\t']) print('\n'.join(line_dest)) print(line_dest[0].split('\t')) raise while (turn_id_dest < turn_id_orig and not (line_dest[2] == 'Server' and line_dest[5].endswith('turn to roll the dice.'))): writer_res.writerow(line_dest) # read next turn from _dest line_dest = reader_dest.next() try: turn_id_dest = parse_turn_id(line_dest[0]) except ValueError: print([i for i, c in enumerate(line_dest[0]) if c == '\t']) print('\n'.join(line_dest)) print(line_dest[0].split('\t')) raise # finally, write the (buffered) subdoc split for buff_line in buff_orig: writer_res.writerow(buff_line) # write extra turns from _dest # DEBUG # FIXME csv reader (doublequote=True) fails to split fields on # if one field contains a doubled double quote (supposedly read # as one double-quote) try: parse_turn_id(line_orig[0]) except ValueError: print('line_orig[0]', line_orig[0]) print('Positions of \\t in line_orig[0]', [i for i, c in enumerate(line_orig[0]) if c == '\t']) print('\n'.join(line_orig)) print(line_orig[0].split('\t')) raise # end DEBUG if parse_turn_id(line_orig[0]) < parse_turn_id(line_dest[0]): err_msg = 'Weird state that should never be reached: {}\t{}' raise ValueError(err_msg.format(line_orig, line_dest)) # new turns in _dest: write as they are while parse_turn_id(line_orig[0]) > parse_turn_id(line_dest[0]): writer_res.writerow(line_dest) line_dest = reader_dest.next() if parse_turn_id(line_orig[0]) != parse_turn_id(line_dest[0]): err_msg = 'Weird state that should never be reached: {}\t{}' raise ValueError(err_msg.format(line_orig, line_dest)) # matching turns: transfer line from _orig, with manually segmented # text # optional warnings for differing texts if verbose: txt_orig = line_orig[5] txt_dest = line_dest[5] raw_text_orig = ''.join(txt_orig.split('&')) raw_text_dest = ''.join(txt_dest.split('&')) if raw_text_orig != raw_text_dest: 
err_msg = [ "W: texts differ at turn {}".format(line_orig[0]), "< " + txt_orig, "> " + txt_dest, ] print('\n'.join(err_msg), file=sys.stderr) # finally, write the line writer_res.writerow(line_orig)