def generate_inputs_from_constraints(constraint_dict, min_size, max_string_size=100):
    """Generate a random string seeded with samples for each constraint.

    A length is drawn uniformly from ``[min_size, max_string_size]`` and a
    list of that many random characters from ``printable[:-4]`` is built.
    For every ``item`` in ``constraint_dict`` the literal occurrences of
    ``item`` in the current string are counted; if there are fewer than
    ``constraint_dict[item]``, the shortfall is made up by overwriting that
    many not-yet-used positions with ``pre.create(pattern).sample()``, where
    ``pattern`` is ``preg_dict[item]`` when ``item`` is a key of
    ``preg_dict`` and ``item`` itself otherwise.

    ``printable``, ``pre`` and ``preg_dict`` are names resolved from the
    enclosing module (``printable`` is presumably ``string.printable`` --
    confirm against the file's imports).

    Args:
        constraint_dict: Mapping from item to the required occurrence count.
        min_size: Smallest length that may be drawn for the base string.
        max_string_size: Largest length that may be drawn; also the cap
            applied to the returned string.

    Returns:
        The generated string, truncated to ``max_string_size`` characters,
        or ``None`` when fewer free positions remain than a constraint
        needs.

    Raises:
        ValueError: From ``random.randint`` if ``min_size`` is greater than
            ``max_string_size``.
    """
    size = random.randint(min_size, max_string_size)
    free_indices = set(range(size))
    slist = random.choices(printable[:-4], k=size)

    for item in constraint_dict:
        current = ''.join(slist)
        present = len(re.findall(re.escape(item), current))
        num_to_insert = max(0, constraint_dict[item] - present)

        if len(free_indices) < num_to_insert:
            return None

        # random.sample() no longer accepts a set (deprecated in 3.9,
        # TypeError from 3.11), so sample from a sorted list instead.
        chosen = set(random.sample(sorted(free_indices), k=num_to_insert))

        for i in chosen:
            pattern = preg_dict[item] if item in preg_dict else item
            slist[i] = pre.create(pattern).sample()

        # Never overwrite a slot that already holds an inserted sample.
        free_indices = free_indices - chosen

    # Inserted samples may be longer than one character, so the joined
    # string can exceed the drawn size.
    string = ''.join(slist)
    if len(string) > max_string_size:
        # NOTE: truncation can cut off inserted samples, so the constraints
        # are not guaranteed to hold on the returned value.
        return string[:max_string_size]
    return string
def match_col(dataset, rstring):
    """Return the indices of the columns in ``dataset`` that ``rstring`` matches.

    The pattern is built once with ``pregex.create(rstring)``. For each
    column the ``match`` score of every example is summed; the column index
    is kept when that sum is not negative infinity.

    Args:
        dataset: Iterable of columns, each an iterable of examples.
        rstring: Pattern string passed to ``pregex.create``.

    Returns:
        List of integer column indices, in dataset order.
    """
    pattern = pregex.create(rstring)
    no_match = float('-inf')
    # presumably ``match`` returns a log-score where -inf means "cannot
    # match" -- confirm against pregex.
    return [
        col_idx
        for col_idx, column in enumerate(dataset)
        if sum(pattern.match(example) for example in column) != no_match
    ]
def regex_plus_bound(X):
    """Return the best per-character score of ``X`` under six ``+`` patterns.

    For each of the patterns ``.+``, ``\\d+``, ``\\w+``, ``\\s+``, ``\\u+``
    and ``\\l+`` (built with ``pregex.create``), the ``match`` score of every
    distinct element of ``X`` is weighted by its multiplicity, summed, and
    divided by the total length of all elements of ``X``. The largest of the
    six resulting values is returned.

    Args:
        X: Collection of strings; it is iterated more than once.

    Returns:
        The maximum normalised score over the candidate patterns.

    Raises:
        ZeroDivisionError: If the total length of the elements of ``X`` is
            zero (for example when ``X`` is empty).
    """
    from pregex import pregex

    counts = Counter(X)
    # Raw strings keep the backslashes intact without relying on Python's
    # deprecated handling of invalid escape sequences such as "\d".
    patterns = [
        pregex.create(source)
        for source in (r".+", r"\d+", r"\w+", r"\s+", r"\u+", r"\l+")
    ]

    # The normaliser does not depend on the pattern, so compute it once.
    total_chars = float(sum(len(x) for x in X))

    return max(
        sum(counts[x] * pattern.match(x) for x in counts) / total_chars
        for pattern in patterns
    )