def find_duplicates(self, json_file_path, from_pandas=False):
    """
    Count the rows of a json file that share a primary key.

    :param json_file_path: Path to a json file. The file should encode a
                           dictionary keyed by table name.

    :param from_pandas: boolean. When truthy, the file is read with the pandas
                        json readers (see the PanDatFactory json readers for
                        details) instead of the default reader.

    :return: A dictionary keyed by the names of the tables that have primary
             keys. Each value is itself a dictionary, mapping every primary
             key value found in that table to the number of json records
             carrying it. Counts below 2 are dropped, since they are not
             duplicates.
    """
    tdf = self.tic_dat_factory
    _standard_verify(tdf)

    if not from_pandas:
        rows_by_table = self._create_jdict(json_file_path)
    else:
        # Imported locally, exactly as the pandas path needs it and no earlier.
        from ticdat import PanDatFactory
        full_schema = tdf.schema(include_ancillary_info=True)
        pan_factory = PanDatFactory.create_from_full_schema(full_schema)
        pan_dat = pan_factory.json.create_pan_dat(json_file_path)
        # Flatten each table into a list of plain tuples, one per record.
        rows_by_table = {}
        for table in pan_factory.all_tables:
            records = getattr(pan_dat, table).itertuples(index=False)
            rows_by_table[table] = [tuple(record) for record in records]

    duplicates = find_duplicates_from_dict_ticdat(tdf, rows_by_table)
    # Normalize any falsey result to an empty dictionary.
    return duplicates if duplicates else {}
def read_lingo_text(tdf, results_text):
    """
    Read Lingo .ldt strings

    :param tdf: A TicDatFactory defining the schema

    :param results_text: A dictionary mapping table names to the Lingo text
                         for that table. Every non-blank line of a text is
                         read as one row, with fields separated by whitespace.
                         The first occurrence of tdf.lingo_prepend is removed
                         from each table name before building the result.

    :return: A TicDat object consistent with tdf
    """
    for table_text in results_text.values():
        verify(stringish(table_text), "text needs to be a string")

    def _get_as_type(val):
        # Anything float() accepts becomes a float; everything else stays a string.
        try:
            return float(val)
        except ValueError:
            return val

    dict_with_lists = defaultdict(list)
    for tbn, table_text in results_text.items():
        rows = []
        for line in table_text.split("\n"):
            fields = line.split()
            # A blank line (which is also what an empty table's text produces)
            # has no fields; skip it rather than emit a zero-length row.
            if fields:
                rows.append([_get_as_type(fld) for fld in fields])
        dict_with_lists[tbn] = rows

    # Kept as an assert on purpose: as the message says, with asserts disabled
    # duplicate rows simply overwrite one another.
    assert not find_duplicates_from_dict_ticdat(tdf, dict_with_lists), \
        "duplicates were found - if asserts are disabled, duplicate rows will overwrite"

    return tdf.TicDat(**{k.replace(tdf.lingo_prepend, "", 1): v
                         for k, v in dict_with_lists.items()})
def find_duplicates(self, json_file_path):
    """
    Count the rows of a json file that share a primary key.

    :param json_file_path: Path to a json file. The file should encode a
                           dictionary keyed by table name.

    :return: A dictionary keyed by the names of the tables that have primary
             keys. Each value is itself a dictionary, mapping every primary
             key value found in that table to the number of json records
             carrying it. Counts below 2 are dropped, since they are not
             duplicates.
    """
    factory = self.tic_dat_factory
    _standard_verify(factory)
    duplicates = find_duplicates_from_dict_ticdat(
        factory, self._create_jdict(json_file_path))
    if not duplicates:
        # Normalize any falsey result to an empty dictionary.
        return {}
    return duplicates
def read_opl_text(tdf, text, commaseperator=True):
    """
    Read an OPL .dat string

    The text is scanned one character at a time by a small state machine.
    Tables look like ``name = { ... }``; inside the braces a row is either a
    ``<...>`` tuple of fields or a single bare value (a quoted string or a
    number). Numbers are converted with float().

    :param tdf: A TicDatFactory defining the schema

    :param text: A string consistent with the OPL .dat format

    :param commaseperator: boolean. If truthy, whitespace (and '{' and ';')
                           outside of quoted strings is ignored, so fields and
                           rows must be separated by commas. If falsey, such a
                           character directly following a field is treated as
                           if it were a comma.

    :return: A TicDat object consistent with tdf. The first occurrence of
             tdf.opl_prepend is removed from each table name found in text.
             Badly formatted text is reported through verify.
    """
    verify(stringish(text), "text needs to be a string")
    # probably want to verify something about the ticdat factory, look at the wiki
    dict_with_lists = defaultdict(list)

    # Parser states.
    #   NONE      - outside any table, accumulating the next table name
    #   TABLE     - inside the braces of a table, between rows
    #   ROW       - inside a <...> tuple, between fields
    #   ROWSTRING - inside a quoted string that is a whole row by itself
    #   ROWNUM    - inside a bare (unquoted) value that is a whole row by itself
    #   FIELD     - inside a tuple, just after the closing quote of a string field
    #   STRING    - inside a quoted string field of a tuple
    #   NUMBER    - inside an unquoted field of a tuple
    NONE, TABLE, ROW, ROWSTRING, ROWNUM, FIELD, STRING, NUMBER = 1, 2, 3, 4, 5, 6, 7, 8
    mode = NONE
    field = ''
    table_name = ''
    row = []

    def to_number(st, pos):
        try:
            return float(st)
        except ValueError:
            verify(False, "Badly formatted string - Field '%s' is not a valid number. "
                          "Character position [%s]." % (st, pos))

    def store_row(table, completed_row):
        # Every row must have as many fields as the first row of its table.
        rows = dict_with_lists[table]
        verify(len(completed_row) == len((rows or [completed_row])[0]),
               "Inconsistent row lengths found for table %s" % table)
        rows.append(completed_row)

    for i, c in enumerate(text):
        in_string = mode in (STRING, ROWSTRING)
        if not in_string and (c.isspace() or c == '{' or c == ';'):
            if mode in (NUMBER, ROWNUM, FIELD) and not commaseperator:
                c = ','  # whitespace that ends a field doubles as the separator
            else:
                continue

        if in_string:
            if c != '"':
                field += c
            elif text[i - 1] == '\\':
                # Escaped quote: swap the stored backslash for the quote itself.
                field = field[:-1] + '"'
            elif mode == ROWSTRING:
                row.append(field)
                field = ''
                store_row(table_name, row)
                row = []
                mode = TABLE
            else:
                mode = FIELD
        elif c == '=':
            verify(mode == NONE,
                   "Badly formatted string, unrecognized '='. Character position [%s]" % i)
            verify(len(table_name) > 0,
                   "Badly formatted string, table name can't be blank. Character position [%s]" % i)
            verify(table_name not in dict_with_lists,
                   "Can't have duplicate table name. Character position [%s]" % i)
            dict_with_lists[table_name] = []
            mode = TABLE
        elif c == '<':
            verify(mode == TABLE,
                   "Badly formatted string, unrecognized '<'. Character position [%s]" % i)
            mode = ROW
        elif c == ',':
            verify(mode in (ROW, FIELD, NUMBER, ROWNUM, TABLE),
                   "Badly formatted string, unrecognized ','. Character position [%s]" % i)
            if mode == TABLE:
                continue  # separator between rows, nothing pending
            if mode == ROWNUM:
                row.append(to_number(field, i))
                field = ''
                store_row(table_name, row)
                row = []
                mode = TABLE
            else:
                if mode == NUMBER:
                    field = to_number(field, i)
                row.append(field)
                field = ''
                mode = ROW
        elif c == '"':
            verify(mode in (ROW, TABLE),
                   "Badly formatted string, unrecognized '\"'. Character position [%s]" % i)
            if mode == ROW:
                mode = STRING
            elif mode == TABLE:
                mode = ROWSTRING
        elif c == '}':
            verify(mode in (TABLE, ROWNUM),
                   "Badly formatted string, unrecognized '}'. Character position [%s]" % i)
            if mode == ROWNUM:
                row.append(to_number(field, i))
                field = ''
                store_row(table_name, row)
                row = []
            table_name = ''
            mode = NONE
        elif c == '>':
            verify(mode in (ROW, FIELD, NUMBER),
                   "Badly formatted string, unrecognized '>'. Character position [%s]" % i)
            if mode == NUMBER:
                field = to_number(field, i)
                mode = FIELD
            if mode == FIELD:
                row.append(field)
                field = ''
            store_row(table_name, row)
            row = []
            mode = TABLE
        else:
            # Any other character. STRING and ROWSTRING were handled above, so
            # every remaining state is acceptable here.
            if mode == NONE:
                table_name += c
            else:
                # A bare value that starts between rows is a row of its own and
                # stays one until it is terminated; anything else is (or
                # becomes) an unquoted field of the current tuple.
                mode = ROWNUM if mode in (TABLE, ROWNUM) else NUMBER
                field += c

    # Running out of text inside a table would otherwise silently drop
    # whatever was still pending.
    verify(mode == NONE,
           "Badly formatted string, text ended while still reading table %s" % table_name)

    # Kept as an assert on purpose: as the message says, with asserts disabled
    # duplicate rows simply overwrite one another.
    assert not find_duplicates_from_dict_ticdat(tdf, dict_with_lists), \
        "duplicates were found - if asserts are disabled, duplicate rows will overwrite"

    return tdf.TicDat(**{k.replace(tdf.opl_prepend, "", 1): v
                         for k, v in dict_with_lists.items()})