def add(table):
    """
    Asks user for input and adds it into the table.

    Args:
        table: table to add new record to

    Returns:
        Table with a new record
    """
    labels = ['Title', 'manufacturer', 'price', 'in_stock']
    user_input = ui.get_inputs(labels, "Please provide information")
    while not (common.is_number(user_input[2]) and common.is_number(user_input[3])):
        ui.print_error_message('Error: Price and Stock value must be numbers')
        user_input = ui.get_inputs(labels, "Please provide information")
    new_id = common.generate_random(table)
    new_record = [new_id] + user_input
    table += [new_record]
    data_manager.write_table_to_file('store/games.csv', table)
    return table
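# None of the snippets in this collection ship the common.is_number helper they
# call. The sketch below is an assumption about its behaviour, not the original
# implementation: it reports whether a value can be read as a number.
def is_number(value):
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False

# is_number('12')   -> True
# is_number('1.5')  -> True
# is_number('12a')  -> False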
def getYearCats(catList, article): ''' Analyses the sv.Wikipedia categories in an article to isolate information related to birth/death :param catList: list of categories :param article: article being worked on :returns: (birth_year, death_year) ''' birth = None death = None if not catList: print 'no category for "%s" or page did not exist' % article else: for c in catList: if c.lower().startswith(u'kategori:avlidna'): if common.is_number(c.strip()[-4:]): death = int(c.strip()[-4:]) else: print u'odd year for %s: %s' % (article, c) elif c.lower().startswith(u'kategori:födda'): if common.is_number(c.strip()[-4:]): birth = int(c.strip()[-4:]) else: print u'odd year for %s: %s' % (article, c) return (birth, death)
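# A minimal standalone sketch (not part of the original module) of the year
# extraction that getYearCats() above performs: a Swedish maintenance category
# such as u'Kategori:Avlidna 1954' carries the year in its last four characters.
def _year_from_category(category):
    tail = category.strip()[-4:]
    return int(tail) if tail.isdigit() else None

# _year_from_category(u'Kategori:Avlidna 1954')      -> 1954
# _year_from_category(u'Kategori:Avlidna okänt år')  -> None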
def add(table):
    """
    Asks user for input and adds it into the table.

    Args:
        table: table to add new record to

    Returns:
        Table with a new record
    """
    labels = ['month', 'day', 'year', 'type', 'amount']
    user_input = ui.get_inputs(labels, "Please provide information")
    while not (common.is_number(user_input[2]) and common.is_number(user_input[4])):
        ui.print_error_message('Error: Year and amount must be numbers')
        user_input = ui.get_inputs(labels, "Please provide information")
    new_id = common.generate_random(table)
    new_record = [new_id] + user_input
    table += [new_record]
    data_manager.write_table_to_file('accounting/items.csv', table)
    return table
def value_checker(title): """Checks Values for Specials Titles""" keep_checking = True while keep_checking: value = ui.get_inputs([title], '') if title == "Price": if common.is_number(value[0]): return value elif title == "Day": if common.is_number(value[0]): number = int(value[0]) if number > 0 and number < 32: return value elif title == "Month": if common.is_number(value[0]): number = int(value[0]) if number > 0 and number <= 12: return value elif title == "Year": if common.is_number(value[0]): number = int(value[0]) if number > 1990 and number <= 2100: return value else: return value
def add(table):
    """
    Asks user for input and adds it into the table.

    Args:
        table: table to add new record to

    Returns:
        Table with a new record
    """
    labels = ['name', 'manufacturer', 'purchase_date', 'durability']
    user_input = ui.get_inputs(labels, "Please provide information")
    while not (common.is_number(user_input[2]) and common.is_number(user_input[3])):
        ui.print_error_message('Error: Purchase date and durability must be numbers')
        user_input = ui.get_inputs(labels, "Please provide information")
    new_id = common.generate_random(table)
    new_record = [new_id] + user_input
    table += [new_record]
    data_manager.write_table_to_file('inventory/inventory.csv', table)
    return table
def create_valid(cls, expression):
    """Verifies that the expression is correct, and if so builds an expression from it"""
    parts = expression.split(' ')
    if len(parts) != 3:
        return False
    if not common.is_number(parts[0]) or not common.is_number(parts[2]):
        return False
    if not cls.__is_valid_operator(parts[1]):
        return False
    return cls(expression)
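# A minimal standalone sketch (independent of the class above, with the numeric
# check inlined) of the same "number operator number" validation idea that
# create_valid() applies to a space-separated expression.
def _looks_like_binary_expression(expression, operators=('+', '-', '*', '/')):
    def _is_num(text):
        try:
            float(text)
            return True
        except ValueError:
            return False
    parts = expression.split(' ')
    return (len(parts) == 3
            and _is_num(parts[0])
            and _is_num(parts[2])
            and parts[1] in operators)

# _looks_like_binary_expression('3 + 4.5')  -> True
# _looks_like_binary_expression('3 +')      -> False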
def _check_origin(self): for i in range(3): d = self.origin[i] if not common.is_number(d): self.error('origin[%s] must be number but got %s'%(i,type(d))) return 1 return 0
def append(self, token): if self.contents: prev = self.contents[-1] # the minus sign implies a * -1 when used by itself if isinstance(prev, tokens.Minus): # TODO: fix this the rest of the way if len(self.contents) == 1: self.contents.pop() self.contents += [tokens.Value(-1), tokens.Mult()] # absorb: tokens can absorb the next token from the expression if it matches a list of types elif isinstance(token, prev.absorbs): if isinstance(token, Base): token = token.flatten() prev.absorb(token) return # implied multiplication elif prev.priority == token.priority == tokens.Pri.NONE: # negative numbers actually have implied addition if isinstance(token, tokens.Value)\ and is_number(token.value) and int(token.value) < 0: self.contents.append(tokens.Plus()) else: self.contents.append(tokens.Mult()) self.raw.append(token) self.contents.append(token)
def ui_remove(apartments, operations, *args): ''' Handles the remove command, calling special functions for each type of argument. Input - the list of apartments, the list of all operations and the arguments Output - error messages if needed ''' if len(args) == 3: startApartment = args[0] endApartment = args[2] if validate_remove_range(startApartment, endApartment) == True: try: remove_range_apartment(apartments, operations, int(startApartment), int(endApartment)) except Exception: print("There are no apartments within the mentioned criteria.") else: if len(args) == 1: if is_number(args[0]): apartmentNumber = int(args[0]) if is_a_valid_apartment_number(apartmentNumber): try: remove_apartment(apartments, operations, apartmentNumber) except Exception: print( "There is no apartment with the mentioned number.") else: transactionType = args[0] if is_a_valid_transaction(transactionType): try: remove_type(apartments, operations, args[0]) except Exception: print("There is no transaction of that type.") else: print("Incorrect command.")
def save_data(self, data_source: dict): """ Save new data to the DB, if it doesn't already exist (i.e. No duplicate data) :param data_source: :return: none """ try: print('Database is updating...') new_list = [] for k, v in data_source.items(): new_row = [k] for nest_key, nest_value in v.items(): # If it's 'M', insert a '' into database. if is_number(nest_value): new_row.append(nest_value) else: new_row.append('') new_list.append(tuple(new_row)) with DBOperations(self.db_name) as cursor: sql_save_data = """INSERT OR IGNORE INTO samples (sample_date,max_temp,min_temp,avg_temp) VALUES (?,?,?, ?); """ for list_item in new_list: cursor.execute(sql_save_data, list_item) print('Database updated.') except Exception as e: self.logger.error(e)
def run(self): self.network() self.init_reader() use_cuda = int(config.get("runner.use_gpu")) place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() self.exe = paddle.static.Executor(place) init_model_path = config.get("runner.model_save_path") init_model_path = os.path.join(config["config_abs_dir"], init_model_path) logger.info("init_model_path: {}".format(init_model_path)) for file in os.listdir(init_model_path): file_path = os.path.join(init_model_path, file) # hard code for epoch model folder if os.path.isdir(file_path) and is_number(file): self.epoch_model_path_list.append(file_path) if len(self.epoch_model_path_list) == 0: self.epoch_model_path_list.append(init_model_path) self.epoch_model_path_list.sort() logger.info("self.epoch_model_path_list: {}".format( self.epoch_model_path_list)) for idx, model_path in enumerate(self.epoch_model_path_list): logger.info("Begin Infer Model {}".format( self.epoch_model_path_list[idx])) model_name = model_path.split("/")[-1] infer_res = self.run_infer(model_path, model_name) self.infer_result_dict["result"][model_name] = infer_res self.record_result() logger.info("Run Success, Exit.")
def index_document_pipe( pipe: Pipeline, cfg: CollectionConfig, doc: Doc ): """Push a document into the index""" # doc_id = doc[ col.id_fld ] doc_id = x_id(doc, cfg.id_fld) pipe.hset( f'{cfg.name}/docs', doc_id, json.dumps(doc) ) for fld in cfg.text_flds: if fld in doc: text = doc[fld] index_text( pipe, cfg, doc_id, text) for fld in cfg.facet_flds: if fld not in doc: continue for val in as_list( doc, fld ): assert is_scalar(val), f"Found non scalar value ({val}) in field '{fld}' of " \ f"document with id {doc_id}" index_facet( pipe, cfg.name, doc_id, fld, val ) for fld in cfg.number_flds: if fld not in doc: continue for val in as_list(doc, fld): if val is None: continue assert is_number(val), f"Found non numeric value ({val}) in field '{fld}' of " \ f"document with id {doc_id}" index_numeric(pipe, cfg.name, doc_id, fld, val)
def run(self): self.network() self.init_reader() use_cuda = int(config.get("runner.use_gpu")) place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() self.exe = paddle.static.Executor(place) init_model_path = config.get("runner.model_save_path") for file in os.listdir(init_model_path): file_path = os.path.join(init_model_path, file) # hard code for epoch model folder if os.path.isdir(file_path) and is_number(file): self.epoch_model_path_list.append(file_path) self.epoch_model_name_list.append(file) if len(self.epoch_model_path_list) == 0: self.epoch_model_path_list.append(init_model_path) self.epoch_model_name_list.append(init_model_path) self.epoch_model_path_list.sort() self.epoch_model_name_list.sort() for idx, model_path in enumerate(self.epoch_model_path_list): logger.info("Begin Infer Model {}".format( self.epoch_model_name_list[idx])) self.run_infer(model_path, self.epoch_model_name_list[idx]) logger.info("Run Success, Exit.")
def check_substitutions(subs): '''Subs: UFL terminals/variable -> sympy expressions of right type''' if not all(is_terminal(k) or isinstance(k, Variable) for k in subs.keys()): return False # If the form is defined in terms of vars as well as terminals we inject # unwrapped variables subs.update({ k.ufl_operands[0]: v for k, v in subs.items() if isinstance(k, Variable) }) check_scalar = lambda k, v: k.ufl_shape == () and (is_scalar(v) or is_number(v)) check_vector = lambda k, v: ( (len(k.ufl_shape) == 1 and is_vector(v)) and (k.ufl_shape[0] in (v.rows, v.cols))) check_matrix = lambda k, v: len(k.ufl_shape) == 2 and k.ufl_shape == ( v.rows, v.cols) check = lambda p: check_scalar(*p) or check_vector(*p) or check_matrix(*p) return all(map(check, subs.items()))
def get_node(self, node): if common.is_number(node): return self.nodes[int(node)] for n in self.nodes: if n.name == node: return n
def ValidateData(self): if not self.txt_emp_id.GetValue().strip(): self.txt_emp_id.SetFocus() return "Employee ID is required" elif not self.txt_name.GetValue().strip(): self.txt_name.SetFocus() return "Employee Name is required" elif not self.cbo_designation.GetValue().strip(): self.cbo_designation.SetFocus() return "Designation is required" elif not self.cbo_posting.GetValue().strip(): self.cbo_posting.SetFocus() return "Posting Place is required" elif not self.txt_incre_amt.GetValue().strip(): self.txt_incre_amt.SetFocus() return "Increment is required" elif not self.dpc_incre_date.GetValue(): self.dpc_incre_date.SetFocus() return "Increment Date is required" elif not self.txt_present_basic.GetValue().strip(): self.txt_present_basic.SetFocus() return "Present Basic is required" elif not self.dpc_print_date.GetValue(): self.dpc_print_date.SetFocus() return "Print date is required" elif self.cbo_designation.GetValue() not in self.designation.values(): self.cbo_designation.SetValue("") self.cbo_designation.SetFocus() return "Designation is not correct" elif self.cbo_posting.GetValue() not in self.posting.values(): self.cbo_posting.SetValue("") self.cbo_posting.SetFocus() return "Posting is not Correct" elif not is_number(self.txt_emp_id.GetValue().strip()): self.txt_emp_id.SetValue("") self.txt_emp_id.SetFocus() return "Employee ID is not Numeric" elif not is_number(self.txt_incre_amt.GetValue().strip()): self.txt_incre_amt.SetValue("") self.txt_incre_amt.SetFocus() return "Increment is not Numeric" elif not is_number(self.txt_present_basic.GetValue().strip()): self.txt_present_basic.SetValue("") self.txt_present_basic.SetFocus() return "Present Basic is not Numeric" else: return "valid"
def is_a_valid_apartment_number(apartmentNumber): ''' Check if a string represents a valid apartment number. Input - a string Output - True if the string is a apartment number, false otherwise ''' if is_number(apartmentNumber) and int(apartmentNumber) > 0: return True print("Apartment number not valid.") return False
def _check_spacing(self): for i in range(3): d = self.spacing[i] if not common.is_number(d): self.error('spacing[%s] must be number but got %s'%(i,type(d))) return 1 if d<=0: self.error('spacing[%s] must be positive number but got %s'%(i,d)) return 1 return 0
def is_a_valid_expense(amount): ''' Check if a string represents a valid expense amount. Input - a string Output - True if the string is a natural number, false otherwise ''' if is_number(amount) and int(amount) > 0: return True print("Expense value not valid.") return False
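# The small validators above (value_checker, is_a_valid_apartment_number,
# is_a_valid_expense) all follow the same pattern: accept the input only when
# it is numeric and its integer value lies in an allowed range. A standalone
# sketch of that pattern with the numeric check inlined:
def _is_int_in_range(text, low=1, high=None):
    try:
        value = int(text)
    except (TypeError, ValueError):
        return False
    return value >= low and (high is None or value <= high)

# _is_int_in_range('15', low=1, high=31)  -> True   (a plausible day)
# _is_int_in_range('0')                   -> False
# _is_int_in_range('abc')                 -> False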
def build_marking(self, marking_string): marking = [0] * len(self.nodes) if common.is_number(marking_string): for i in range(0, len(self.nodes)): marking[i] = int(marking_string[i]) else: nds = marking_string.split(",") for i in range(0, len(nds)): val = nds[i].split("=") node = self.get_node(val[0].strip()) marking[node.id] = int(val[1]) return marking
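# A standalone sketch (with a hypothetical node class, not the original) that
# illustrates the two marking formats build_marking() above accepts: a plain
# digit string such as '0101' assigns one digit per node in order, while a
# comma-separated form such as 'p2=1,p4=1' names individual nodes.
class _Node(object):
    def __init__(self, node_id, name):
        self.id = node_id
        self.name = name

def _build_marking(nodes, marking_string):
    marking = [0] * len(nodes)
    if marking_string.isdigit():
        for i in range(len(nodes)):
            marking[i] = int(marking_string[i])
    else:
        for part in marking_string.split(","):
            name, value = part.split("=")
            node = next(n for n in nodes if n.name == name.strip())
            marking[node.id] = int(value)
    return marking

# _nodes = [_Node(i, 'p%d' % (i + 1)) for i in range(4)]
# _build_marking(_nodes, '0101')       -> [0, 1, 0, 1]
# _build_marking(_nodes, 'p2=1,p4=1')  -> [0, 1, 0, 1]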
def analyseDates(self, output=True): wlm_date = (int(self.settingDate[:4]), int(self.settingDate[5:])) #Spcial bins current = u'current (%s)' % self.settingDate since_last = u'since_last (%d-%s – %s)' % ( wlm_date[0] - 1, str(wlm_date[1]).zfill(2), current) rest_of_last_year = u'rest_of_last_year (%d – %d-%s)' % ( wlm_date[0] - 1, wlm_date[0] - 1, str(wlm_date[1]).zfill(2)) results = {current: 0, since_last: 0, rest_of_last_year: 0} blanks = 0 for k, v in self.indata.iteritems(): date_raw = v['created'] #skip any entries without valid monument_id or value if date_raw == '' or len(date_raw) < 4: blanks += 1 continue #prepare dates month = 0 if not common.is_number(date_raw[:4]): date = ('text', 'text') else: if len(date_raw) >= 7: month = int(date_raw[5:7]) date = (int(date_raw[:4]), month) #binning if date == wlm_date: #the current competition results[current] += 1 elif (date[0] == wlm_date[0] and date[1] < wlm_date[1]) or (date[0] == wlm_date[0] - 1 and date[1] > wlm_date[1]): #since last competition results[since_last] += 1 elif date[0] == wlm_date[0] - 1: #the rest of that year results[rest_of_last_year] += 1 else: if not str(date[0]) in results.keys(): results[str(date[0])] = 1 else: results[str(date[0])] += 1 if output: #to simple to be outputSimple() f = codecs.open(u'%s_dates.csv' % self.output, 'w', 'utf-8') f.write('#no. dates: %d\n' % len(results)) f.write('#no. blanks: %d\n' % blanks) f.write('#dates|no. images\n') for k, v in results.iteritems(): f.write('%s|%d\n' % (k, v)) f.close()
def match_name(self): pos = self._scanner parts = [] while not self._scanner.eof() and self.is_ident(self.peek()): if len(parts) > 0 and parts[-1] == '_' and self.peek() == '_': self.fail('consecutive-underscores') parts.append(self.peek()) self.next() name = ''.join(parts) if common.is_number(name): return token.Token(token.NUM, int(name), position=pos) else: type = KEYWORDS.get(name, token.ID) return token.Token(type, name, position=pos)
def expr_body(expr, coordnames=DEFAULT_NAMES, **kwargs): '''Generate a/list of string/s that is the Cpp code for the expression''' if is_number(expr): return expr_body(sp.S(expr), **kwargs) if isinstance(expr, sp.Expr) and is_scalar(expr): # Defined in terms of some coordinates xyz = set(coordnames) xyz_used = xyz & expr.free_symbols assert xyz_used <= xyz # Recognize the constant if not expr.free_symbols: # Flag that we can be constant return str(expr), kwargs, True # Expression params which need default values params = (expr.free_symbols - xyz_used) # Substitute for x[0], x[1], ... expr = expr.subs( {x: sp.Symbol('x[%d]' % i) for i, x in enumerate(coordnames)}, simultaneous=True) # Body expr = ccode(expr).replace('M_PI', 'pi') # Default to zero kwargs.update(dict((str(p), kwargs.get(str(p), 0)) for p in params)) # Convert return expr, kwargs, False # Tensors that sympy can represent as lists # (1, n) to (n, 1) to list of n if is_vector(expr): expr = sum(expr.tolist(), []) elif is_matrix(expr): expr = expr.tolist() # Other lists # FIXME: Can this be implemented without returning kwargs, i.e. the # scalar place would modify it's arguments. For now I don't see how # https://stackoverflow.com/questions/45883655/is-it-always-safe-to-modify-the-kwargs-dictionary kwargs_ = kwargs is_constant_expr = True ans = () for e in expr: f, kwargs_, is_constant = expr_body(e, **kwargs_) is_constant_expr = is_constant_expr and is_constant ans = ans + (f, ) return ans, kwargs_, is_constant_expr
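# A minimal standalone sketch of the core substitution expr_body() above relies
# on: coordinate symbols are renamed to x[0], x[1], ... and the expression is
# emitted as C code via sympy's ccode, with M_PI rewritten to pi as in the
# original.
import sympy as sp

def _scalar_body(expr, coordnames=('x', 'y')):
    subs = {sp.Symbol(name): sp.Symbol('x[%d]' % i)
            for i, name in enumerate(coordnames)}
    return sp.ccode(expr.subs(subs, simultaneous=True)).replace('M_PI', 'pi')

# x, y = sp.symbols('x y')
# _scalar_body(sp.sin(sp.pi * x) * y)  -> 'x[1]*sin(pi*x[0])'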
def generate_box_plot(self, start_year: int, end_year: int) -> dict:
    """
    Generate a box plot from weather data across a range of years.
    :param start_year: starting year for box plotting
    :param end_year: ending year for box plotting
    :return: returns the generated box plot images' saving paths class instance
    """
    try:
        print('Generate a BOX PLOT between years[{0}-{1}]...'.format(
            start_year, end_year))
        my_db = DBOperations('weather.sqlite')
        years_data_list = []
        for current_year in range(start_year, end_year + 1):
            years_data_list.extend(my_db.fetch_data(current_year))
        # format: {1: [Jan temps], 2: [Feb temps], ..., 12: [Dec temps]}
        monthly_weather_data = {}
        for month in range(1, 13):
            if month not in monthly_weather_data:
                monthly_weather_data[month] = []
        for item in years_data_list:
            if is_number(item[5]):
                monthly_weather_data[int(item[1][5:7])].append(
                    float(item[5]))
        plot_title = 'Monthly Temperature Distribution for: ' + str(
            start_year) + ' to ' + str(end_year)
        plt.boxplot(monthly_weather_data.values(), sym="o", whis=1.5)
        plt.xlabel('Month')
        plt.ylabel('Temperature (Celsius)')
        plt.title(plot_title)
        file_name = str(start_year) + '_to_' + str(end_year) + '.png'
        # Create new directory
        output_dir = "images"
        mkdir_p(output_dir)
        file_path = '{0}/{1}'.format(output_dir, file_name)
        self.box_plot_path_saving_dict[str(start_year) + '-' +
                                       str(end_year)] = file_path
        plt.savefig(file_path)
        plt.show()
        return self.box_plot_path_saving_dict
    except Exception as e:
        self.logger.error(e)
def main(): counts = defaultdict(int) for line in sys.stdin: review = json.loads(line) if 'rating' not in review: continue rating = review['rating'] if not is_number(rating): continue counts[rating] += 1 print '"rating","count"' for rating, count in counts.iteritems(): print '"{0}","{1}"'.format(rating, count)
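# A standalone sketch of the same aggregation in Python 3 syntax (the snippet
# above is Python 2): count reviews per rating from JSON lines, skipping
# entries whose rating is missing or non-numeric.
import json
from collections import defaultdict

def _count_ratings(lines):
    counts = defaultdict(int)
    for line in lines:
        review = json.loads(line)
        rating = review.get('rating')
        if rating is None or not isinstance(rating, (int, float)):
            continue
        counts[rating] += 1
    return counts

# _count_ratings(['{"rating": 5}', '{"rating": 5}', '{"text": "no rating"}'])
# -> defaultdict(<class 'int'>, {5: 2})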
def generate_line_plot(self, specific_year: int, specific_month: int) -> dict: """ Generate a line plot by month data. :param specific_month: the chosen month for line plotting :param specific_year: the chosen year for line plotting :return: returns the generated line plot images' saving paths class instance """ try: print('Generate a Line PLOT for [{0}-{1}]...'.format( specific_year, specific_month)) month_string_list = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ] my_db = DBOperations('weather.sqlite') specific_timestamp = [] # 2020-12-01 specific_month_data = [] month_data = my_db.fetch_data(specific_year, specific_month) for item in month_data: if is_number(item[5]): specific_timestamp.append(float(item[1][-2:])) specific_month_data.append(float(item[5])) plt.plot(specific_timestamp, specific_month_data) plt.xlabel('Day') plt.ylabel('Temperature (Celsius)') plot_title = 'Daily Temperature Distribution for: ' + month_string_list[ specific_month - 1] + ' ' + str(specific_year) plt.title(plot_title) file_name = str(specific_year) + '-' + str(specific_month) + '.png' # Create new directory output_dir = "images" mkdir_p(output_dir) file_path = '{0}/{1}'.format(output_dir, file_name) self.line_plot_path_saving_dict[str(specific_year) + '-' + str(specific_month)] = file_path plt.savefig(file_path) plt.show() return self.line_plot_path_saving_dict except Exception as e: self.logger.error(e)
def ui_filter(apartments, operations, *args): ''' Handles the filter command. Input - the list of apartments, the list of operation and the argument as a string Output - an error message if needed or calls the valid sub-func ''' if len(args) == 1: oldApartments = deepcopy(apartments) operations.append(('filter', oldApartments)) if is_number(args[0]): if is_a_valid_expense(args[0]): filter_value(apartments, int(args[0])) else: if is_a_valid_transaction(args[0]): filter_type(apartments, args[0]) else: print("Incorrect command.")
def add(table):
    """
    Asks user for input and adds it into the table.

    Args:
        table: table to add new record to

    Returns:
        Table with a new record
    """
    labels = ['Title', 'manufacturer', 'price', 'in_stock']
    user_input = ui.get_inputs(labels, "Please provide information")
    while not common.is_number(user_input[2]):
        ui.print_error_message('Error: Price must be a number')
        user_input = ui.get_inputs(labels, "Please provide information")
    new_id = common.generate_random(table)
    new_record = [new_id] + user_input
    table += [new_record]
    data_manager.write_table_to_file('store/games.csv', table)
    return table
def number(self, dot=True, test=False, inc=True): num = '' first = True pos = self.pos while self.more(pos): char = self.source[pos] if char == '-' and first: pass elif not char.isdigit(): break first = False num += char pos += 1 if char == '.' and dot: num += '.' pos += 1 self.pos, tmp = pos, self.pos try: num += str(self.number(dot=False)) except ParseError: pass pos, self.pos = self.pos, tmp if inc and not test: self.pos = pos if is_number(num): if test: return True try: n = int(num) except ValueError: n = float(num) return n else: if test: return False lines = self.source[:pos] line = lines.count('\n') + 1 col = max(self.pos - lines.rfind('\n'), 0) raise ParseError('invalid number ending at {}:{}: {}'.format( line, col, num))
def perform_semantic_inference(cluster_collection): """ This function performs semantic inference on a list of clusters given For each message in these clusters semantics are inferred by analyzing the token resp. its context. At the moment only two semantics are automatically inferred: numeric and IPv4 address TODO: Add more semantics, e.g. EOL identifier, lenght fields, ... """ # Try to perform semantic inferences # Walk through every cluster and check messages for obvious results cluster = cluster_collection.get_all_cluster() for c in cluster: messages = c.get_messages() for message in messages: tokenlist = message.get_tokenlist() iterator = peekable(tokenlist) idx = 0 while not iterator.isLast(): # for tokenRepresentation in tokenlist: tokenRepresentation = iterator.next() # TODO: do we need to keep semantics which involve multiple cluster? e.g. sessionids? previous_semantics = tokenRepresentation.get_semantics() tokenRepresentation.set_semantics([]) # Clear existing semantics from previous run # for s in previous_semantics: # if s.startswith("sessionid"): # tokenRepresentation.add_semantic(s) # break if "sessionid" in previous_semantics: # Check if we have at least 2 messages and we are not of type Const if len(messages) > 1 and c.get_format(idx) != Message.typeConst: tokenRepresentation.add_semantic("sessionid") if "FD" in previous_semantics: tokenRepresentation.add_semantic("FD") token = tokenRepresentation.get_token() # Check whether it is numeric try: isNumber = tokenRepresentation.get_tokenType() == Message.typeText and common.is_number(token) except TypeError: if Globals.getConfig().debug: print "Error checking token {0} for number semantics".format(token) isNumber = False if isNumber: tokenRepresentation.add_semantic("numeric") # c.add_semantics(idx,"numeric") # print "Inferred semantic inference 'numeric' for token ", token # Check whether it is an IP address if isinstance(token, str) and common.is_ipv4(token): tokenRepresentation.add_semantic("ipv4 address") # Do not add to cluster unless it is valid for all c.add_semantics(idx,"ipv4 address") # print "Inferred semantic inference 'ipv4 address' for token ", token # Check for carriage return identifiers # When 0d is followed by 0a we've got a CR-LF # Sensible? When 0d or 0a is the last token, we've got a single CR resp LF # In all other cases assume 0d/0a is just a hex value of the protocol if token == 0xD: nextOne = iterator.peek() if isinstance(nextOne, TokenRepresentation): if nextOne.get_token() == 0xA: inferred_formats = c.get_format_inference() if ( inferred_formats[idx].getType() == Message.typeConst and inferred_formats[idx + 1].getType() == Message.typeConst ): tokenRepresentation.add_semantic("CR") # c.add_semantics(idx,"CR") nextOne = iterator.next() nextOne.set_semantics(["LF"]) # c.add_semantics(idx+1, "LF") idx += 1 idx += 1 # Perform other tests like "is length field?" # explicitely iterate through all messages like stated in the paper # we could also postpone this to the call of 'pushToClusterSeminatics" but.. 
reference_message = messages[0] tokenlist = reference_message.get_tokenlist() idx = 0 for tokenRepresentation in tokenlist: if tokenRepresentation.get_tokenType() == Message.typeBinary and idx + 1 < len(tokenlist): ref_value = tokenRepresentation.get_token() if ( not tokenlist[idx + 1].get_tokenType() == Message.typeText ): # We require that the next token is the text token in question idx += 1 continue ref_next_length = tokenlist[idx + 1].get_length() if not ref_value == ref_next_length: # This is no length field idx += 1 continue ref_message_length = reference_message.get_length() is_length = True for message in messages: cmp_value = message.get_tokenlist()[idx].get_token() cmp_next_length = message.get_tokenlist()[idx + 1].get_length() cmp_message_length = message.get_length() try: diff_val = abs(cmp_value - ref_value) except TypeError: # Could happen if a short text token is mistaken as a binary value break diff_next_length = abs(cmp_next_length - ref_next_length) # The next line also takes total msg length differences into account. This might not be true for # all protocols diff_msg_length = abs(cmp_message_length - ref_message_length) if Globals.getConfig().requireTotalLengthChangeForLengthField: if not (diff_val == diff_next_length == diff_msg_length): is_length = False break else: if not (diff_val == diff_next_length): is_length = False break if is_length: # set "lengthfield" semantic for every message in the cluster at the given position for message in messages: # TODO: What if there's only one message in the cluster? Sensible? message.get_tokenlist()[idx].add_semantic("lengthfield") c.add_semantic_for_token(idx, "lengthfield") idx += 1 # Try to identify sessionid fields reference_message = messages[0] nextInFlow = reference_message.getNextInFlow() if nextInFlow != None and not ( len(messages) == 1 and Globals.getConfig().sessionIDOnlyWithClustersWithMoreThanOneMessage ): tokenlist = reference_message.get_tokenlist() next_tokenlist = nextInFlow.get_tokenlist() ref_idx = 0 for tokenRepresentation in tokenlist: tokType = tokenRepresentation.get_tokenType() # If its not a binary, it cannot be a cookie if tokType != Message.typeBinary: ref_idx += 1 continue fmt = c.get_format(ref_idx) # If its a binary but const, it cannot be a cookie if fmt[1] == Message.typeConst: ref_idx += 1 continue # Set reference value ref_val = tokenRepresentation.get_token() # Walk next flow for reference value next_idx = 0 for next_tokenRepresentation in next_tokenlist: # Retrieve next token type nextTokType = next_tokenRepresentation.get_tokenType() # If it is not a binary we don't see it as a cookie if Globals.getConfig().sessionIDOnlyWithBinary: if nextTokType != Message.typeBinary: next_idx += 1 continue next_cluster = nextInFlow.getCluster() # Get format of comparating message comp_fmt = next_cluster.get_format(next_idx) # If it is const, it cannot be a sessonid if comp_fmt[1] == Message.typeConst: next_idx += 1 continue # Load comparator value comp_val = next_tokenRepresentation.get_token() if ( ref_val == comp_val ): # We've got a potential hit, now compare all messages for the same idx pairs isCookie = True for cmp_ref_msg in messages: if not isCookie: break if cmp_ref_msg == messages[0]: # Skip first message (we've already checked that one continue cmp_ref_tok_list = cmp_ref_msg.get_tokenlist() cmp_ref_val = cmp_ref_tok_list[ref_idx].get_token() cmp_cmp_msg = cmp_ref_msg.getNextInFlow() if cmp_cmp_msg == None: isCookie = False else: cmp_cmp_tok_list = cmp_cmp_msg.get_tokenlist() if next_idx >= 
len(cmp_cmp_tok_list): # Obviously "next" points to messages in different clusters # so the len might differ from the reference next cluster # used to find our reference cookie value # Therefore this cannot be a cookie isCookie = False continue # Make sure the comparing token is also not constant cmp_cmp_fmt = cmp_cmp_msg.getCluster().get_format(next_idx) # If it is const, it cannot be a sessonid if cmp_cmp_fmt == Message.typeConst: isCookie = False continue # Finally compare the values cmp_cmp_val = cmp_cmp_tok_list[next_idx].get_token() if (cmp_ref_val != cmp_cmp_val) or ( (cmp_ref_val == cmp_cmp_val) and (cmp_ref_val == ref_val) ): isCookie = False if isCookie: # Set cookie semantic in this message and the other # sessionid = uuid.uuid1() for message in messages: # Set for every message and the cluster itself # message.get_tokenlist()[ref_idx].add_semantic("sessionid_{0}".format(sessionid)) message.get_tokenlist()[ref_idx].add_semantic("sessionid") nextMsg = message.getNextInFlow() # nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid_{0}".format(sessionid)) nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid") c.add_semantic_for_token(ref_idx, "sessionid") # c.add_semantic_for_token(ref_idx,"sessionid_{0}".format(sessionid)) next_idx += 1 ref_idx += 1 # Try to find random fields (16 bit) token_formats = c.get_formats() idx = 0 for token_format in token_formats: rep, form, semantics = token_format if form.getType() == Message.typeVariable and rep == Message.typeBinary: try: variance = c.getVariableStatistics()[idx].getVariance() except Exception: pass if variance > 1000 and len(semantics) == 0: # We've got a very high variance and no assigned semantics --> candidate for random # Have a look at the last but one token if idx - 1 >= 0: rep, form, semantics = token_formats[idx - 1] if form.getType() == Message.typeVariable and rep == Message.typeBinary: stats = c.getVariableStatistics()[idx - 1] if stats != None: variance2 = stats.getVariance() else: logging.error( "Did not receive cluster statistics for token {0} (len of formats {1}, len of stats {2})".format( idx, len(token_formats), len(c.getVariableStatistics()) ) ) idx += 1 continue if variance2 > 1000 and len(semantics) == 0: # Consider the two as a CRC-16 for message in messages: # Set for every message and the cluster itself message.get_tokenlist()[idx - 1].add_semantic("random") message.get_tokenlist()[idx].add_semantic("random") c.add_semantic_for_token(idx - 1, "random") c.add_semantic_for_token(idx, "random") idx += 1 # Try to find sets (valued limited in variability with lower and upper bound) token_formats = c.get_formats() idx = 0 for token_format in token_formats: rep, form, semantics = token_format if form.getType() == Message.typeVariable: stats = c.getVariableStatistics()[idx] if stats != None: distinct = stats.numberOfDistinctSamples() else: logging.error( "Did not receive cluster statistics for token {0} (len of formats {1}, len of stats {2})".format( idx, len(token_formats), len(c.getVariableStatistics()) ) ) idx += 1 continue # How will be find out whether a number of variable values is a set or really variable? # We assume that there is an absolute maximum amount of distinct values which is independent # of the actual number of messages. However we also need to consider that when the number of # messages in a cluster definitily falls below the setAbsoluteMax value, we have to adapt # the local maximum in this cluster. 
# For the moment we take a multiplier for the number of messages (default 0.3 == 30%) and # assume it is a set, when both, setAbsoluteMax and the localThreshold is underrun # In addition we assume that we have no semantics for this token, as other semantics conflict # with the notion of a set if ( distinct <= Globals.getConfig().setAbsoluteMax and distinct <= (c.getNumberOfMessages() * Globals.getConfig().setPercentageThreshold) and len(semantics) == 0 ): for message in messages: # Set for every message and the cluster itself message.get_tokenlist()[idx].add_semantic("set") c.add_semantic_for_token(idx - 1, "set") idx += 1 # Push to cluster pushUpToCluster(cluster_collection)
def findMatches(odok, wiki): ''' tries to find matches between scraped items and exisiting odok items identified matches has the odok id added to the wiki object TO DO: Expand to display several alternatives ''' # remove any id's which have already been identified matched_ids = [] for w in wiki: if w['id']: if w['id'] in matched_ids: print u'id %s was matched to more than one wiki object!' % w['id'] else: matched_ids.append(w['id']) print u'%r out of %r already matched (out of a maximum of %r)' % (len(matched_ids), len(wiki), len(odok)) # make lists of odok titles and artists odok_titles = {} odok_artist = {} odok_surname = {} for key, o in odok.iteritems(): if key in matched_ids: continue if o['title']: if o['title'] in odok_titles.keys(): odok_titles[o['title']].append(key) else: odok_titles[o['title']] = [key, ] if o['artist']: if o['artist'] in odok_artist.keys(): odok_artist[o['artist']].append(key) else: odok_artist[o['artist']] = [key, ] surname = wash(o['artist'].split(' ')[-1]) if surname in odok_surname.keys(): odok_surname[surname].append(key) else: odok_surname[surname] = [key, ] # remove any id's which have already been identified for w in wiki: if w['id']: continue wIdN = None wIdA = None wIdS = None match = ([], '') if w['namn'] in odok_titles.keys(): wIdN = odok_titles[w['namn']] if w[u'skulptör'] in odok_artist.keys(): wIdA = odok_artist[w[u'skulptör']] if wash(w[u'skulptör'].split(' ')[-1]) in odok_surname.keys(): wIdS = odok_surname[wash(w[u'skulptör'].split(' ')[-1])] if wIdN and wIdA: # match on both title and artist if len(wIdN) == 1: if wIdN[0] in wIdA: match = ([wIdN[0]], 'double match') else: match = ([wIdN[0]], 'title match but artist missmatch') else: for nId in wIdN: if nId in wIdA: match = ([nId], 'Non-unique title with artist match') break elif wIdN: # match on title only match = (wIdN, 'titel match') elif wIdA: # match on artist only match = (wIdA, 'artist match') elif wIdS: # last ditch attempt matching surname. match = (wIdS, 'surname match') # always check this of no match? # replace do "nice search" with ss->s # explicitly ask for verification for each match if match[0]: keys = match[0] print u'%s: (%s)' % (match[1], ' | '.join(keys)) print u'W: "%s", "%s", (%s), "%s"' % (w[u'namn'], w[u'skulptör'], w[u'årtal'], w['plats']) for r in range(0, len(keys)): key = keys[r] print u'%r: "%s", "%s", (%s), "%s"' % (r, odok[key]['title'], odok[key][u'artist'], odok[key][u'year'], odok[key][u'address']) while True: inChoice = raw_input('Accept? [#/N]:') if inChoice == 'N' or inChoice == 'n': break elif common.is_number(inChoice) and int(inChoice) in range(0, len(keys)): w['id'] = keys[int(inChoice)] break
def updatesToDatabase(odok, wiki, quick=False): ''' given a wiki-entry which has been matched to an odok object this checks whether any of the wikiinfo should be added to the odok object and prepares an update statement. setting quick to true puts any updates requiring decision making into the postponed output ''' wpApi = wikiApi.WikiApi.setUpApi(user=config['w_username'], password=config['w_password'], site=config['wp_site']) updated = {} postponed = {} linked_artists = {} mapping = {u'namn': 'title', u'skulptör': 'artist', u'årtal': 'year', u'material': 'material', u'plats': 'address', u'header': 'district', u'lat': 'lat', u'lon': 'lon', u'bild': 'image', u'typ': 'type'} # non-trivial mappings u'namn_link': 'wiki' for w in wiki: if not w['id']: continue o = odok[w['id']] changes = {} skipped = {} for k, v in mapping.iteritems(): if k not in w.keys(): # for postponed file some fields might be missing continue no_Tags, dummy = common.extractLink(w[k], kill_tags=True) if not no_Tags: # skip if w[k] is empty (or only a tag) continue if (not o[v]) and no_Tags: # trivial case of new info changes[v] = no_Tags elif o[v] and (not o[v].lower() == no_Tags.lower()): if quick: skipped[k] = w[k] else: # need to decide which to use print u'Diff for %s (%s): %s' % (w['id'], o['title'], v) print u' ödok: "%s"' % o[v] print u' wiki: "%s"' % w[k] while True: inChoice = raw_input(u'Use wiki [Y(es)/N(o)/S(kip)]:') if inChoice.lower() == u'n' or inChoice.lower() == u'no': break elif inChoice.lower() == u'y' or inChoice.lower() == u'yes': changes[v] = no_Tags break elif inChoice.lower() == u's' or inChoice.lower() == u'skip': skipped[k] = w[k] break # register any artist_links so that these can be compared to existing links if u'skulptör_link' in w.keys() and w[u'skulptör_link']: # postponed might not have u'skulptör_link' for a in w[u'skulptör_link']: if a in linked_artists.keys(): linked_artists[a].append(w['id']) else: linked_artists[a] = [w['id'], ] # article_links must be checked manually since link may be depictive rather than of the actual object. if (u'namn_link' in w.keys() and w['namn_link']) and not o[u'wiki']: # postponed might not have u'namn_link' keys = w['namn_link'] print u'Potential title link for "%s" ("%s" on wiki)' % (o['title'], w['namn']) for r in range(0, len(keys)): print u'%r: "%s"' % (r, keys[r]) while True: inChoice = raw_input('Accept? [#/N]:') if inChoice == 'N' or inChoice == 'n': break elif common.is_number(inChoice) and int(inChoice) in range(0, len(keys)): # NEW START wdInfo = wpApi.getPageInfo(keys[int(inChoice)], debug=True)[keys[int(inChoice)]] if 'wikidata' in wdInfo.keys() and wdInfo['wikidata']: # if exists and not empty changes[u'wiki'] = wdInfo['wikidata'] break # add changes if changes: updated[w['id']] = changes.copy() if skipped: postponed[w['id']] = skipped.copy() # end of wiki_object loop # Build new wikidata-module - # Build odok_write-module in the same spirit. Moving lots of writeToDatabase to that # om inte header, try page # plats_link? return (updated, postponed, linked_artists)
def compareToDB(wikiObj, odokObj, wpApi, dbReadSQL, verbose=False): ''' compares a listobj to equiv obj in database this needs to deal with links and wikitext this should check clash parameter should return (diff, log) diff: dict of changes (if any) otherwise NONE log: list of issues encountered e.g. incorrecly formated wikitext TODO: proper log for coordinates only care about first X decimals in coordinate return needed/removed links fotnot-name should anything be done with: * odok:u'same_as' * odok:u'year_cmt' ''' # wikiObj.keys() = [u'typ', u'artikel', u'titel', 'clash', u'inomhus', u'material', u'döljStadsdel', u'län', u'konstnär2', # u'konstnär3', u'konstnär4', u'konstnär5', u'konstnär6', u'konstnär7', u'konstnär8', u'konstnär9', # u'döljKommun', u'lat', u'plats', u'fotnot', u'fotnot2', u'fotnot3', u'id', u'kommun', # u'bild', u'stadsdel', u'commonscat', u'fri', u'konstnär', u'lon', u'beskrivning', u'årtal', u'id-länk', # u'fotnot-namn', u'fotnot2-namn', u'fotnot3-namn', u'aka', u'page', u'lista', u'header'] # odokObj.keys() = [u'changed', u'official_url', u'ugc', u'image', u'county', u'year', u'owner', u'commons_cat', u'id', # u'wiki', u'list', u'descr', u'title', u'lon', u'source', u'same_as', u'type', u'muni', u'material', u'free', # u'district', u'address', u'lat', u'year_cmt', u'artist', u'inside', u'created', u'cmt', u'removed'] log = '' if wikiObj['clash']: log += u'clash with another page. Don\'t know how to resolve this. Skipping: %s\n' % wikiObj['clash'] return (None, log) ## Pre-processing # get some more things from ODOK odokObj[u'linked_artists'] = dbReadSQL.findArtist(wikiObj[u'id']) odokObj[u'artist_links'] = [] for a in odokObj[u'linked_artists']: odokObj[u'artist_links'].append(a['wiki']) odokObj[u'aka'] = '' akas = dbReadSQL.findAkas(wikiObj[u'id']) if akas: odokObj[u'aka'] = [] for a in akas: odokObj[u'aka'].append(a['aka']) odokObj[u'aka'] = ';'.join(odokObj[u'aka']) if odokObj[u'wiki']: odokObj[u'wiki'] = odokObj[u'wiki'].upper() # the following is inherited from the header if wikiObj[u'header'][u'tidigare']: wikiObj[u'tidigare'] = 1 else: wikiObj[u'tidigare'] = 0 # the following may be inherited from the header if wikiObj[u'döljKommun']: wikiObj[u'kommun'] = wikiObj[u'header'][u'kommun'] if not wikiObj[u'län']: wikiObj[u'län'] = wikiObj[u'header'][u'län'] if wikiObj[u'döljStadsdel'] and not wikiObj[u'stadsdel']: # only overwrite non existant wikiObj[u'stadsdel'] = wikiObj[u'header'][u'stadsdel'] # the following are limited in their values but need mapping from wiki to odok before comparison if wikiObj[u'fri'].lower() == 'nej': wikiObj[u'fri'] = 'unfree' if wikiObj[u'inomhus']: if wikiObj[u'inomhus'].lower() == 'ja': wikiObj[u'inomhus'] = 1 elif wikiObj[u'inomhus'].lower() == 'nej': wikiObj[u'inomhus'] = 0 else: log += 'unexpected value for inside-parameter (defaulting to no): %s\n' % wikiObj[u'inomhus'] wikiObj[u'inomhus'] = 0 else: wikiObj[u'inomhus'] = 0 if wikiObj[u'kommun']: # need muni code wikiObj[u'kommun'] = dataDict.muni_name2code[wikiObj[u'kommun']] if wikiObj[u'län'].startswith(u'SE-'): wikiObj[u'län'] = wikiObj[u'län'][len(u'SE-'):] if wikiObj[u'lat'] == '': wikiObj[u'lat'] = None else: if len(wikiObj[u'lat']) > 16: wikiObj[u'lat'] = '%.13f' % float(wikiObj[u'lat']) wikiObj[u'lat'] = wikiObj[u'lat'].strip('0') # due to how numbers are stored if wikiObj[u'lon'] == '': wikiObj[u'lon'] = None else: if len(wikiObj[u'lon']) > 16: wikiObj[u'lon'] = '%.13f' % float(wikiObj[u'lon']) wikiObj[u'lon'] = wikiObj[u'lon'].strip('0') # due to how numbers are stored 
if wikiObj[u'årtal'] == '': wikiObj[u'årtal'] = None # Deal with artists (does not deal with order of artists being changed): artist_param = [u'konstnär', u'konstnär2', u'konstnär3', u'konstnär4', u'konstnär5', u'konstnär6', u'konstnär7', u'konstnär8', u'konstnär9'] wikiObj[u'artists'] = '' artists_links = {} for a in artist_param: if wikiObj[a]: (w_text, w_links) = unwiki(wikiObj[a]) wikiObj[u'artists'] = u'%s%s;' % (wikiObj[u'artists'], w_text) if w_links: artists_links[w_text] = w_links[0] if wikiObj[u'artists']: wikiObj[u'artists'] = wikiObj[u'artists'][:-1] # trim trailing ; ## dealing with links: links = artists_links.values() if wikiObj[u'artikel']: if u'#' in wikiObj[u'artikel']: log += u'link to section: %s\n' % wikiObj[u'artikel'] else: links.append(wikiObj[u'artikel']) if links: links = wpApi.getPageInfo(links) for k, v in links.iteritems(): if u'disambiguation' in v.keys(): log += u'link to disambigpage: %s\n' % k links[k] = '' elif u'wikidata' in v.keys(): links[k] = v[u'wikidata'] else: links[k] = '' else: links = {} # Stick wikidata back into parameters if wikiObj[u'artikel']: if u'#' not in wikiObj[u'artikel']: wikiObj[u'artikel'] = links.pop(wikiObj[u'artikel']) else: wikiObj[u'artikel'] = '' wikiObj[u'artist_links'] = links.values() ## Main-process diff = {} # easy to compare {wiki:odok} trivial_params = {u'typ': u'type', u'material': u'material', u'id-länk': u'official_url', u'fri': u'free', u'inomhus': u'inside', u'artists': u'artist', u'årtal': u'year', u'commonscat': u'commons_cat', u'beskrivning': u'descr', u'bild': u'image', u'titel': u'title', u'aka': u'aka', u'artikel': u'wiki', u'list': u'list', u'plats': u'address', u'län': u'county', u'kommun': u'muni', u'stadsdel': u'district', u'tidigare': u'removed', u'lat': u'lat', u'lon': u'lon', u'fotnot': u'cmt'} for k, v in trivial_params.iteritems(): (w_text, w_links) = unwiki(wikiObj[k]) if not (w_text == odokObj[v]): diff[v] = {'new': w_text, 'old': odokObj[v]} if verbose: print u'%s:"%s" <---> %s:"%s"' % (k, w_text, v, odokObj[v]) ## Needing separate treatment # comparing artist_links: u'artist_links':u'artist_links' artist_diff = {'+': [], '-': []} artist_links = list(set(wikiObj[u'artist_links'])-set(odokObj[u'artist_links'])) if artist_links and len(''.join(artist_links)) > 0: artist_diff['+'] = artist_links[:] # slice to clone the list artist_links = list(set(odokObj[u'artist_links'])-set(wikiObj[u'artist_links'])) if artist_links and len(''.join(artist_links)) > 0: artist_diff['-'] = artist_links[:] # slice to clone the list # handler can only deal with new artists if len(artist_diff['-']) == 0 and len(artist_diff['+']) > 0: artIds = dbReadSQL.getArtistByWiki(artist_diff['+']) # list of id:{'first_name', 'last_name', 'wiki', 'birth_date', 'death_date', 'birth_year', 'death_year'} newArtistLinks = [] for k, v in artIds.iteritems(): artist_diff['+'].remove(v['wiki']) newArtistLinks.append(k) if len(newArtistLinks) > 0: diff[u'artist_links'] = {'new': newArtistLinks, 'old': []} # output remaining to log for k, v in artist_diff.iteritems(): if len(v) > 0: log += u'difference in artist links, linkdiff%s: %s\n' % (k, ';'.join(v)) ## akas if 'aka' not in diff.keys(): pass elif sorted(diff['aka']['new'].split(';')) == sorted(diff['aka']['old'].split(';')): del(diff['aka']) else: aka_diff = {'+': [], '-': []} aka_list = list(set(diff['aka']['new'].split(';'))-set(diff['aka']['old'].split(';'))) if aka_list and len(''.join(aka_list)) > 0: aka_diff['+'] = aka_list[:] # slice to clone the list aka_list = 
list(set(diff['aka']['old'].split(';'))-set(diff['aka']['new'].split(';'))) if aka_list and len(''.join(aka_list)) > 0: aka_diff['-'] = aka_list[:] # slice to clone the list # handler can only deal with new akas if len(aka_diff['-']) == 0 and len(aka_diff['+']) > 0: diff[u'aka_list'] = {'new': aka_diff['+'], 'old': []} del(aka_diff['+']) # output remaining to log for k, v in aka_diff.iteritems(): if len(v) > 0: log += u'difference in akas, diff%s: %s\n' % (k, ';'.join(v)) # remove these for now del(diff['aka']) ## Post-processing # fotnot-namn without fotnot - needs to look-up fotnot for o:cmt if wikiObj[u'fotnot-namn'] and not wikiObj[u'fotnot']: log += u'fotnot-namn so couldn\'t compare, fotnot-namn: %s\n' % wikiObj[u'fotnot-namn'] if u'cmt' in diff.keys(): del diff[u'cmt'] # free defaults to unfree in wiki but not necessarily in db if 'free' in diff.keys() and diff['free']['new'] == '': if diff['free']['old'] == 'unfree': diff.pop('free') # Years which are not plain numbers cannot be sent to db if 'year' in diff.keys(): if not common.is_int(diff['year']['new']): year = diff.pop('year') log += u'Non-integer year: %s\n' % year['new'] # lat/lon reqires an extra touch as only decimal numbers and nones may be sent to db if 'lat' in diff.keys(): if not diff['lat']['new']: # if new = None pass elif not common.is_number(diff['lat']['new']): lat = diff.pop('lat') log += u'Non-decimal lat: %s\n' % lat['new'] if 'lon' in diff.keys(): if not diff['lon']['new']: pass elif not common.is_number(diff['lon']['new']): lat = diff.pop('lon') log += u'Non-decimal lon: %s\n' % diff['lon']['new'] # Basic validation of artist field: if 'artist' in diff.keys(): # check that number of artists is the same if '[' in diff['artist']['old']: artist = diff.pop('artist') log += u'cannot deal with artists which include group affilitations: %s --> %s\n' % (artist['old'], artist['new']) elif (len(diff['artist']['old'].split(';')) != len(diff['artist']['new'].split(';'))) and (len(diff['artist']['old']) > 0): # if not the same number when there were originally some artists artist = diff.pop('artist') log += u'difference in number of artists: %s --> %s\n' % (artist['old'], artist['new']) # Unstripped refrences for k in diff.keys(): if k in (u'official_url', u'inside', u'removed'): # not strings or ok to have http continue if diff[k]['new'] and 'http' in diff[k]['new']: val = diff.pop(k) log += u'new value for %s seems to include a url: %s --> %s\n' % (k, val['old'], val['new']) return (diff, log)
def perform_semantic_inference(cluster_collection):
    """
    Performs semantic inference on the clusters of the given cluster collection.
    For each message in these clusters semantics are inferred by analyzing the token resp. its context.
    At the moment only two semantics are automatically inferred: numeric and IPv4 address
    TODO: Add more semantics, e.g. EOL identifier, length fields, ...
    """
    # Try to perform semantic inferences
    # Walk through every cluster and check messages for obvious results
    cluster = cluster_collection.get_all_cluster()
    for c in cluster:
        messages = c.get_messages()
        for message in messages:
            tokenlist = message.get_tokenlist()
            iterator = peekable(tokenlist)
            idx = 0
            while not iterator.isLast():
                #for tokenRepresentation in tokenlist:
                tokenRepresentation = iterator.next()
                # TODO: do we need to keep semantics which involve multiple clusters? e.g. sessionids?
                previous_semantics = tokenRepresentation.get_semantics()
                tokenRepresentation.set_semantics([])  # Clear existing semantics from previous run
                #for s in previous_semantics:
                #    if s.startswith("sessionid"):
                #        tokenRepresentation.add_semantic(s)
                #        break
                if "sessionid" in previous_semantics:
                    # Check if we have at least 2 messages and we are not of type Const
                    if len(messages) > 1 and c.get_format(idx) != Message.typeConst:
                        tokenRepresentation.add_semantic("sessionid")
                if "FD" in previous_semantics:
                    tokenRepresentation.add_semantic("FD")
                token = tokenRepresentation.get_token()
                # Check whether it is numeric
                try:
                    isNumber = tokenRepresentation.get_tokenType() == Message.typeText and common.is_number(token)
                except TypeError:
                    if Globals.getConfig().debug:
                        print "Error checking token {0} for number semantics".format(token)
                    isNumber = False
                if isNumber:
                    tokenRepresentation.add_semantic("numeric")
                    #c.add_semantics(idx, "numeric")
                    #print "Inferred semantic inference 'numeric' for token ", token
                # Check whether it is an IP address
                if isinstance(token, str) and common.is_ipv4(token):
                    tokenRepresentation.add_semantic("ipv4 address")
                    # Do not add to cluster unless it is valid for all
                    c.add_semantics(idx, "ipv4 address")
                    #print "Inferred semantic inference 'ipv4 address' for token ", token
                # Check for carriage return identifiers
                # When 0d is followed by 0a we've got a CR-LF
                # Sensible? When 0d or 0a is the last token, we've got a single CR resp. LF
                # In all other cases assume 0d/0a is just a hex value of the protocol
                if token == 0xd:
                    nextOne = iterator.peek()
                    if isinstance(nextOne, TokenRepresentation):
                        if nextOne.get_token() == 0xa:
                            inferred_formats = c.get_format_inference()
                            if (inferred_formats[idx].getType() == Message.typeConst
                                    and inferred_formats[idx + 1].getType() == Message.typeConst):
                                tokenRepresentation.add_semantic("CR")
                                #c.add_semantics(idx, "CR")
                                nextOne = iterator.next()
                                nextOne.set_semantics(["LF"])
                                #c.add_semantics(idx + 1, "LF")
                                idx += 1
                idx += 1
        # Perform other tests like "is length field?"
        # Explicitly iterate through all messages like stated in the paper;
        # we could also postpone this to the call of 'pushUpToCluster' but...
        reference_message = messages[0]
        tokenlist = reference_message.get_tokenlist()
        idx = 0
        for tokenRepresentation in tokenlist:
            if tokenRepresentation.get_tokenType() == Message.typeBinary and idx + 1 < len(tokenlist):
                ref_value = tokenRepresentation.get_token()
                if not tokenlist[idx + 1].get_tokenType() == Message.typeText:
                    # We require that the next token is the text token in question
                    idx += 1
                    continue
                ref_next_length = tokenlist[idx + 1].get_length()
                if not ref_value == ref_next_length:
                    # This is no length field
                    idx += 1
                    continue
                ref_message_length = reference_message.get_length()
                is_length = True
                for message in messages:
                    cmp_value = message.get_tokenlist()[idx].get_token()
                    cmp_next_length = message.get_tokenlist()[idx + 1].get_length()
                    cmp_message_length = message.get_length()
                    try:
                        diff_val = abs(cmp_value - ref_value)
                    except TypeError:
                        # Could happen if a short text token is mistaken as a binary value
                        break
                    diff_next_length = abs(cmp_next_length - ref_next_length)
                    # The next line also takes total msg length differences into account.
                    # This might not be true for all protocols
                    diff_msg_length = abs(cmp_message_length - ref_message_length)
                    if Globals.getConfig().requireTotalLengthChangeForLengthField:
                        if not (diff_val == diff_next_length == diff_msg_length):
                            is_length = False
                            break
                    else:
                        if not (diff_val == diff_next_length):
                            is_length = False
                            break
                if is_length:
                    # Set "lengthfield" semantic for every message in the cluster at the given position
                    for message in messages:
                        # TODO: What if there's only one message in the cluster? Sensible?
                        message.get_tokenlist()[idx].add_semantic("lengthfield")
                    c.add_semantic_for_token(idx, "lengthfield")
            idx += 1
        # Try to identify sessionid fields
        reference_message = messages[0]
        nextInFlow = reference_message.getNextInFlow()
        if nextInFlow is not None and not (len(messages) == 1 and
                                           Globals.getConfig().sessionIDOnlyWithClustersWithMoreThanOneMessage):
            tokenlist = reference_message.get_tokenlist()
            next_tokenlist = nextInFlow.get_tokenlist()
            ref_idx = 0
            for tokenRepresentation in tokenlist:
                tokType = tokenRepresentation.get_tokenType()
                # If it's not a binary, it cannot be a cookie
                if tokType != Message.typeBinary:
                    ref_idx += 1
                    continue
                fmt = c.get_format(ref_idx)
                # If it's a binary but const, it cannot be a cookie
                if fmt[1] == Message.typeConst:
                    ref_idx += 1
                    continue
                # Set reference value
                ref_val = tokenRepresentation.get_token()
                # Walk next flow for reference value
                next_idx = 0
                for next_tokenRepresentation in next_tokenlist:
                    # Retrieve next token type
                    nextTokType = next_tokenRepresentation.get_tokenType()
                    # If it is not a binary we don't see it as a cookie
                    if Globals.getConfig().sessionIDOnlyWithBinary:
                        if nextTokType != Message.typeBinary:
                            next_idx += 1
                            continue
                    next_cluster = nextInFlow.getCluster()
                    # Get format of the comparing message
                    comp_fmt = next_cluster.get_format(next_idx)
                    # If it is const, it cannot be a sessionid
                    if comp_fmt[1] == Message.typeConst:
                        next_idx += 1
                        continue
                    # Load comparator value
                    comp_val = next_tokenRepresentation.get_token()
                    if ref_val == comp_val:
                        # We've got a potential hit, now compare all messages for the same idx pairs
                        isCookie = True
                        for cmp_ref_msg in messages:
                            if not isCookie:
                                break
                            if cmp_ref_msg == messages[0]:
                                # Skip first message (we've already checked that one)
                                continue
                            cmp_ref_tok_list = cmp_ref_msg.get_tokenlist()
                            cmp_ref_val = cmp_ref_tok_list[ref_idx].get_token()
                            cmp_cmp_msg = cmp_ref_msg.getNextInFlow()
                            if cmp_cmp_msg is None:
                                isCookie = False
                            else:
                                cmp_cmp_tok_list = cmp_cmp_msg.get_tokenlist()
                                if next_idx >= len(cmp_cmp_tok_list):
                                    # Obviously "next" points to messages in different clusters,
                                    # so the len might differ from the reference next cluster
                                    # used to find our reference cookie value.
                                    # Therefore this cannot be a cookie
                                    isCookie = False
                                    continue
                                # Make sure the comparing token is also not constant
                                # (indexing [1] matches the format checks above; the original compared the whole tuple)
                                cmp_cmp_fmt = cmp_cmp_msg.getCluster().get_format(next_idx)
                                # If it is const, it cannot be a sessionid
                                if cmp_cmp_fmt[1] == Message.typeConst:
                                    isCookie = False
                                    continue
                                # Finally compare the values
                                cmp_cmp_val = cmp_cmp_tok_list[next_idx].get_token()
                                if (cmp_ref_val != cmp_cmp_val) or ((cmp_ref_val == cmp_cmp_val) and (cmp_ref_val == ref_val)):
                                    isCookie = False
                        if isCookie:
                            # Set cookie semantic in this message and the other
                            #sessionid = uuid.uuid1()
                            for message in messages:
                                # Set for every message and the cluster itself
                                #message.get_tokenlist()[ref_idx].add_semantic("sessionid_{0}".format(sessionid))
                                message.get_tokenlist()[ref_idx].add_semantic("sessionid")
                                nextMsg = message.getNextInFlow()
                                #nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid_{0}".format(sessionid))
                                nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid")
                            c.add_semantic_for_token(ref_idx, "sessionid")
                            #c.add_semantic_for_token(ref_idx, "sessionid_{0}".format(sessionid))
                    next_idx += 1
                ref_idx += 1
        # Try to find random fields (16 bit)
        token_formats = c.get_formats()
        idx = 0
        for token_format in token_formats:
            rep, form, semantics = token_format
            if form.getType() == Message.typeVariable and rep == Message.typeBinary:
                try:
                    variance = c.getVariableStatistics()[idx].getVariance()
                except Exception:
                    # Treat missing statistics as "no variance" instead of reusing a stale value
                    variance = 0
                if variance > 1000 and len(semantics) == 0:
                    # We've got a very high variance and no assigned semantics --> candidate for random
                    # Have a look at the last but one token
                    if idx - 1 >= 0:
                        rep, form, semantics = token_formats[idx - 1]
                        if form.getType() == Message.typeVariable and rep == Message.typeBinary:
                            stats = c.getVariableStatistics()[idx - 1]
                            if stats is not None:
                                variance2 = stats.getVariance()
                            else:
                                logging.error("Did not receive cluster statistics for token {0} "
                                              "(len of formats {1}, len of stats {2})".format(
                                                  idx, len(token_formats), len(c.getVariableStatistics())))
                                idx += 1
                                continue
                            if variance2 > 1000 and len(semantics) == 0:
                                # Consider the two as a CRC-16
                                for message in messages:
                                    # Set for every message and the cluster itself
                                    message.get_tokenlist()[idx - 1].add_semantic("random")
                                    message.get_tokenlist()[idx].add_semantic("random")
                                c.add_semantic_for_token(idx - 1, "random")
                                c.add_semantic_for_token(idx, "random")
            idx += 1
        # Try to find sets (values limited in variability with lower and upper bound)
        token_formats = c.get_formats()
        idx = 0
        for token_format in token_formats:
            rep, form, semantics = token_format
            if form.getType() == Message.typeVariable:
                stats = c.getVariableStatistics()[idx]
                if stats is not None:
                    distinct = stats.numberOfDistinctSamples()
                else:
                    logging.error("Did not receive cluster statistics for token {0} "
                                  "(len of formats {1}, len of stats {2})".format(
                                      idx, len(token_formats), len(c.getVariableStatistics())))
                    idx += 1
                    continue
                # How will we find out whether a number of variable values is a set or really variable?
                # We assume that there is an absolute maximum amount of distinct values which is independent
                # of the actual number of messages. However we also need to consider that when the number of
                # messages in a cluster definitely falls below the setAbsoluteMax value, we have to adapt
                # the local maximum in this cluster.
                # For the moment we take a multiplier for the number of messages (default 0.3 == 30%) and
                # assume it is a set when both setAbsoluteMax and the local threshold are underrun.
                # In addition we assume that we have no semantics for this token, as other semantics conflict
                # with the notion of a set
                if (distinct <= Globals.getConfig().setAbsoluteMax and
                        distinct <= (c.getNumberOfMessages() * Globals.getConfig().setPercentageThreshold) and
                        len(semantics) == 0):
                    for message in messages:
                        # Set for every message and the cluster itself
                        message.get_tokenlist()[idx].add_semantic("set")
                    # Tag the cluster at the same index as the messages (the original used idx - 1)
                    c.add_semantic_for_token(idx, "set")
            idx += 1
    # Push to cluster
    pushUpToCluster(cluster_collection)
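# The token loop in perform_semantic_inference() relies on a peekable iterator
# exposing next(), peek() and isLast(). That helper is not shown in this file;
# the following is only a minimal sketch of the assumed interface, and the
# project's own implementation may differ.
class peekable(object):
    """Wraps a list and allows looking at the next element without consuming it."""

    def __init__(self, items):
        self._items = list(items)
        self._pos = 0

    def next(self):
        # Return the current element and advance the cursor
        item = self._items[self._pos]
        self._pos += 1
        return item

    def peek(self):
        # Look at the upcoming element without advancing; None when exhausted
        if self._pos < len(self._items):
            return self._items[self._pos]
        return None

    def isLast(self):
        # True once all elements have been consumed
        return self._pos >= len(self._items)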
def updatesToDatabase(odok, wiki, quick=False):
    '''
    Given a wiki entry which has been matched to an odok object, this checks whether any
    of the wiki info should be added to the odok object and prepares an update statement.
    Setting quick to True puts any updates requiring decision making into the postponed output.
    '''
    wpApi = wikiApi.WikiApi.setUpApi(user=dconfig.w_username,
                                     password=dconfig.w_password,
                                     site=dconfig.wp_site)
    updated = {}
    postponed = {}
    linked_artists = {}
    mapping = {
        u'namn': 'title',
        u'skulptör': 'artist',
        u'årtal': 'year',
        u'material': 'material',
        u'plats': 'address',
        u'header': 'district',
        u'lat': 'lat',
        u'lon': 'lon',
        u'bild': 'image',
        u'typ': 'type'
    }
    # non-trivial mappings: u'namn_link': 'wiki'
    for w in wiki:
        if not w['id']:
            continue
        o = odok[w['id']]
        changes = {}
        skipped = {}
        for k, v in mapping.iteritems():
            if k not in w.keys():
                # for the postponed file some fields might be missing
                continue
            no_Tags, dummy = common.extractLink(w[k], kill_tags=True)
            if not no_Tags:
                # skip if w[k] is empty (or only a tag)
                continue
            if (not o[v]) and no_Tags:
                # trivial case of new info
                changes[v] = no_Tags
            elif o[v] and (not o[v].lower() == no_Tags.lower()):
                if quick:
                    skipped[k] = w[k]
                else:
                    # need to decide which to use
                    print u'Diff for %s (%s): %s' % (w['id'], o['title'], v)
                    print u'   ödok: "%s"' % o[v]
                    print u'   wiki: "%s"' % w[k]
                    while True:
                        inChoice = raw_input(u'Use wiki [Y(es)/N(o)/S(kip)]:')
                        if inChoice.lower() == u'n' or inChoice.lower() == u'no':
                            break
                        elif inChoice.lower() == u'y' or inChoice.lower() == u'yes':
                            changes[v] = no_Tags
                            break
                        elif inChoice.lower() == u's' or inChoice.lower() == u'skip':
                            skipped[k] = w[k]
                            break
        # register any artist_links so that these can be compared to existing links
        if u'skulptör_link' in w.keys() and w[u'skulptör_link']:
            # postponed might not have u'skulptör_link'
            for a in w[u'skulptör_link']:
                if a in linked_artists.keys():
                    linked_artists[a].append(w['id'])
                else:
                    linked_artists[a] = [w['id'], ]
        # article_links must be checked manually since the link may be depictive rather than of the actual object
        if (u'namn_link' in w.keys() and w['namn_link']) and not o[u'wiki']:
            # postponed might not have u'namn_link'
            keys = w['namn_link']
            print u'Potential title link for "%s" ("%s" on wiki)' % (o['title'], w['namn'])
            for r in range(0, len(keys)):
                key = keys[r]
                print u'%r: "%s"' % (r, keys[r])
            while True:
                inChoice = raw_input('Accept? [#/N]:')
                if inChoice == 'N' or inChoice == 'n':
                    break
                elif common.is_number(inChoice) and int(inChoice) in range(0, len(keys)):
                    # NEW START
                    wdInfo = wpApi.getPageInfo(keys[int(inChoice)], debug=True)[keys[int(inChoice)]]
                    if 'wikidata' in wdInfo.keys() and wdInfo['wikidata']:
                        # if it exists and is not empty
                        changes[u'wiki'] = wdInfo['wikidata']
                    break
        # add changes
        if changes:
            updated[w['id']] = changes.copy()
        if skipped:
            postponed[w['id']] = skipped.copy()
    # end of wiki_object loop
    # Build new wikidata-module
    # Build odok_write-module in the same spirit. Moving lots of writeToDatabase to that
    # if there is no header, try page
    # plats_link?
    return (updated, postponed, linked_artists)
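# Hypothetical illustration of the input shapes updatesToDatabase() expects,
# inferred from the field accesses above. All ids and values are invented;
# a real run also needs the project's dconfig credentials plus the common and
# wikiApi modules, and the exact result depends on common.extractLink().
odok = {
    u'1234': {'title': u'Gestalt', 'artist': u'', 'year': u'', 'material': u'',
              'address': u'', 'district': u'', 'lat': u'', 'lon': u'',
              'image': u'', 'type': u'', u'wiki': u''},
}
wiki = [
    {'id': u'1234', u'namn': u'Gestalt', u'skulptör': u'Okänd',
     u'årtal': u'1970', u'skulptör_link': [], u'namn_link': []},
]
# quick=True avoids interactive prompts; conflicting fields end up in postponed
updated, postponed, linked_artists = updatesToDatabase(odok, wiki, quick=True)
print updated  # e.g. {u'1234': {'artist': u'Okänd', 'year': u'1970'}}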
def findMatches(odok, wiki):
    '''
    Tries to find matches between scraped items and existing odok items.
    Identified matches have the odok id added to the wiki object.
    TO DO: Expand to display several alternatives
    '''
    # remove any id's which have already been identified
    matched_ids = []
    for w in wiki:
        if w['id']:
            if w['id'] in matched_ids:
                print u'id %s was matched to more than one wiki object!' % w['id']
            else:
                matched_ids.append(w['id'])
    print u'%r out of %r already matched (out of a maximum of %r)' % (len(matched_ids), len(wiki), len(odok))
    # make lists of odok titles and artists
    odok_titles = {}
    odok_artist = {}
    odok_surname = {}
    for key, o in odok.iteritems():
        if key in matched_ids:
            continue
        if o['title']:
            if o['title'] in odok_titles.keys():
                odok_titles[o['title']].append(key)
            else:
                odok_titles[o['title']] = [key, ]
        if o['artist']:
            if o['artist'] in odok_artist.keys():
                odok_artist[o['artist']].append(key)
            else:
                odok_artist[o['artist']] = [key, ]
            surname = wash(o['artist'].split(' ')[-1])
            if surname in odok_surname.keys():
                odok_surname[surname].append(key)
            else:
                odok_surname[surname] = [key, ]
    # try to match any wiki objects which do not yet have an id
    for w in wiki:
        if w['id']:
            continue
        wIdN = None
        wIdA = None
        wIdS = None
        match = ([], '')
        if w['namn'] in odok_titles.keys():
            wIdN = odok_titles[w['namn']]
        if w[u'skulptör'] in odok_artist.keys():
            wIdA = odok_artist[w[u'skulptör']]
        if wash(w[u'skulptör'].split(' ')[-1]) in odok_surname.keys():
            wIdS = odok_surname[wash(w[u'skulptör'].split(' ')[-1])]
        if wIdN and wIdA:
            # match on both title and artist
            if len(wIdN) == 1:
                if wIdN[0] in wIdA:
                    match = ([wIdN[0]], 'double match')
                else:
                    match = ([wIdN[0]], 'title match but artist mismatch')
            else:
                for nId in wIdN:
                    if nId in wIdA:
                        match = ([nId], 'Non-unique title with artist match')
                        break
        elif wIdN:
            # match on title only
            match = (wIdN, 'title match')
        elif wIdA:
            # match on artist only
            match = (wIdA, 'artist match')
        elif wIdS:
            # last ditch attempt matching surname. Always check this if no match?
            match = (wIdS, 'surname match')
        # TODO: do a "nice search" with ss -> s replacement
        # explicitly ask for verification for each match
        if match[0]:
            keys = match[0]
            print u'%s: (%s)' % (match[1], ' | '.join(keys))
            print u'W: "%s", "%s", (%s), "%s"' % (w[u'namn'], w[u'skulptör'], w[u'årtal'], w['plats'])
            for r in range(0, len(keys)):
                key = keys[r]
                print u'%r: "%s", "%s", (%s), "%s"' % (r, odok[key]['title'], odok[key][u'artist'],
                                                       odok[key][u'year'], odok[key][u'address'])
            while True:
                inChoice = raw_input('Accept? [#/N]:')
                if inChoice == 'N' or inChoice == 'n':
                    break
                elif common.is_number(inChoice) and int(inChoice) in range(0, len(keys)):
                    w['id'] = keys[int(inChoice)]
                    break
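# The surname index in findMatches() depends on a wash() string normaliser that
# is defined elsewhere in the project. Below is only a minimal sketch of the
# kind of normalisation it presumably performs (an assumption; the real helper
# may do more, e.g. handle markup or diacritics).
def wash(text):
    text = text.strip().lower()
    # drop characters that commonly differ between sources
    for ch in u'.,-_()':
        text = text.replace(ch, u'')
    return text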