def __init__(self, row, transcript_metadata):
    """
    Populate this instance from one corpus CSV row and its metadata.

    Every name in Utterance.header becomes an attribute whose value is
    the whitespace-stripped cell at the same position in `row`, or None
    when `row` is too short to contain that position.  Some columns are
    converted before being stored:

    trees -- Tree(None, <"|||"-separated segments>), or [] when the
        cell is empty or missing
    ptb_treenumbers -- list of ints parsed from the "|||"-separated
        cell, or [] when the cell is empty or missing
    act_tag -- the cell with every "*" removed
    conversation_no, transcript_index, utterance_index,
    subutterance_index -- int(cell)

    The caller_sex, caller_education, caller_birth_year and
    caller_dialect_area attributes are then copied from
    `transcript_metadata`, using the 'to_'-prefixed keys when
    self.caller ends with "B" and the 'from_'-prefixed keys otherwise.

    Arguments:
    row (list) -- a row from one of the corpus CSV files
    transcript_metadata (dict) -- a Metadata value based on the
        current conversation_no

    Raises:
    AttributeError -- if the act_tag cell is missing from `row`
    TypeError, ValueError -- if an integer column is missing or is not
        a valid integer literal
    KeyError -- if `transcript_metadata` lacks a required caller key
    """
    ##################################################
    # Utterance data:
    int_columns = ('conversation_no', 'transcript_index',
                   'utterance_index', 'subutterance_index')
    for i, att_name in enumerate(Utterance.header):
        # Rows may be shorter than the header; absent cells are None.
        row_value = row[i].strip() if i < len(row) else None
        # Special handling of non-string values.
        if att_name == "trees":
            # NOTE(review): an earlier revision applied Tree to each
            # "|||"-separated segment individually (yielding a list);
            # this call instead passes all segments to a single Tree
            # with None as its first argument, while the empty case
            # still yields a list.  Confirm this is intended.
            if row_value:
                row_value = Tree(None, row_value.split("|||"))
            else:
                row_value = []
        elif att_name == "ptb_treenumbers":
            if row_value:
                # A comprehension (not map) so the result is a real
                # list on both Python 2 and Python 3.
                row_value = [int(num) for num in row_value.split("|||")]
            else:
                row_value = []
        elif att_name == 'act_tag':
            # Conjoined tags (separated by "," or ";") are deliberately
            # not split: the docs suggest that they are single tags,
            # though, so the value is kept as one str.
            # `` Transcription errors (typos, obvious mistranscriptions)
            # are marked with a "*" after the discourse tag.''
            # These are removed for this version.
            row_value = row_value.replace("*", "")
        elif att_name in int_columns:
            row_value = int(row_value)
        # Add the attribute.
        setattr(self, att_name, row_value)
    ##################################################
    # Caller data:
    # self.caller was set by the loop above (assuming 'caller' is one
    # of the header names); the prefix is the same for every key, so
    # it is computed once.
    prefix = 'to_' if self.caller.endswith("B") else 'from_'
    for key in ('caller_sex', 'caller_education',
                'caller_birth_year', 'caller_dialect_area'):
        setattr(self, key, transcript_metadata[prefix + key])