def compile(self):
    '''
    Compile the text files to DDStorm modules.

    Collects *.txt data files from the library and custom paths,
    clears out previously generated *.module files, then compiles
    every source file to a module in priority order.
    '''
    self.source = set()
    self.custom = set()
    self.alias = Alias(self._conf)
    # Loop over library files and add *.txt files to source
    for path, subdirs, files in os.walk(self._conf.get("library_path")):
        for name in files:
            if fnmatch(name, "*.txt"):
                self.source.add(os.path.join(path, name))
    # Loop over custom files and add *.txt files to custom
    for path, subdirs, files in os.walk(self._conf.get("custom_path")):
        for name in files:
            if fnmatch(name, "*.txt"):
                self.custom.add(os.path.join(path, name))
    # Create module directory if not already present and delete all module files
    if not os.path.isdir(self._conf.get("module_path")):
        os.makedirs(self._conf.get("module_path"))
    for f in os.listdir(self._conf.get("module_path")):
        if fnmatch(f, "*.module"):
            # os.path.join is correct whether or not module_path carries a
            # trailing separator; plain string concatenation was not.
            os.unlink(os.path.join(self._conf.get("module_path"), f))
    # Raw string avoids the invalid "\." / "\d" escape warnings on modern
    # Python; the pattern (trailing ".<digits>" priority tag) is unchanged.
    self.priorityRegex = re.compile(r"(?<=\.)\d+$")
    # First sort files by priority then compile them to module
    for src in self._sortPriority(self.source):
        self._makeModule(src)
    for src in self._sortPriority(self.custom):
        self._makeModule(src)
def main(args, app):
    """Remove the alias whose name matches args.name from the app config."""
    for entry in Alias.FromConfig(app.config):
        if entry.name != args.name:
            continue
        # Found it: remove, persist, report, and stop looking.
        entry.remove(app.config)
        app.config.save()
        print('removed', entry.format_url())
        break
def compile(self):
    '''
    Compile the text files to DDStorm modules.

    Gathers *.txt sources from the library and custom directories,
    resets the module directory, then builds modules in priority order.
    '''
    self.source = set()
    self.custom = set()
    self.alias = Alias(self._conf)
    # Loop over library files and add *.txt files to source
    for path, subdirs, files in os.walk(self._conf.get("library_path")):
        for name in files:
            if fnmatch(name, "*.txt"):
                self.source.add(os.path.join(path, name))
    # Loop over custom files and add *.txt files to custom
    for path, subdirs, files in os.walk(self._conf.get("custom_path")):
        for name in files:
            if fnmatch(name, "*.txt"):
                self.custom.add(os.path.join(path, name))
    # Create module directory if not already present and delete all module files
    if not os.path.isdir(self._conf.get("module_path")):
        os.makedirs(self._conf.get("module_path"))
    for f in os.listdir(self._conf.get("module_path")):
        if fnmatch(f, "*.module"):
            # Join instead of concatenation: safe with or without a
            # trailing separator on module_path.
            os.unlink(os.path.join(self._conf.get("module_path"), f))
    # Raw string fixes the invalid "\." / "\d" escape sequences flagged
    # by modern Python; the matched text is identical.
    self.priorityRegex = re.compile(r"(?<=\.)\d+$")
    # First sort files by priority then compile them to module
    for src in self._sortPriority(self.source):
        self._makeModule(src)
    for src in self._sortPriority(self.custom):
        self._makeModule(src)
def recv_captcha(self, mailfrom, msg):
    """
    Receives and verifies a captcha reply.

    The captcha identifier is the last whitespace-separated token of the
    Subject; the answer is the first line of the message body.  On a
    correct answer the captcha is marked CAPTCHA_APPROVED and an
    approval request is sent to the alias owner.
    """
    subject = msg['Subject']
    # str.split replaces string.split(), which was removed in Python 3;
    # the result is identical.
    orgidentifier = subject.split(' ')[-1]
    logging.debug('Orig CAPTCHA identifier\t: %s', orgidentifier)
    # TODO: Reject if original identifier is not in DB
    try:
        if msg.is_multipart():
            answer = msg.get_payload(0).get_payload().splitlines()[0].strip()
        else:
            answer = msg.get_payload().splitlines()[0].strip()
    except Exception:
        # Malformed or empty payload: nothing to verify.  (Narrowed from
        # a bare except so KeyboardInterrupt/SystemExit still propagate.)
        return
    identifier = self.db.hash_data(Address(mailfrom).address, answer)
    match = self.db.get_captcha_word(identifier)
    if match is not None and match['word'] == answer:
        # Update captcha status to CAPTCHA_APPROVED
        cid, rid, word = match
        adata = self.db.get_alias_data(rid)
        aobj = Alias(**adata)
        user = User(**self.db.get_user(uid=aobj.get_uid()))
        # send message to recipient's alias requesting mailfrom's
        # permission to send
        msg = UserMessage('senderverify.senduserreq',
                          fromaddx=self.cfg.SVCALIAS,
                          aliasaddx=aobj.get_alias_address(),
                          useraddx=user.get_account_address(),
                          requestor=mailfrom)
        msg.generate_message_id(self.cfg.DOMAIN)
        self.db.set_captcha(msg['Message-ID'], '', cid, rid,
                            self.db.CAPTCHA_APPROVED)
        logging.debug('Sending approval request to user %s',
                      user.get_username())
        self.send(msg['From'], [user.get_forwarding_address()], msg)
        # Delete identifier from database
        #self.db.delete_captcha_identifier(identifier)
    else:
        # TOFIX: should replace with new captcha and increment numtries;
        pass
def _build_alias(self):
    """Render the alias map into _alias.py and return its length helper."""
    env = Environment(loader=FileSystemLoader(self.TEMPLATES_PATH))
    tpl = env.get_template(self.VARIABLE_TEMPLATE)
    alias_map = Alias(self._base_path).get_map()
    rendered = tpl.render(dict(varName='g_aliasMap', value=repr(alias_map)))
    # NOTE(review): file is opened in binary mode while render() returns
    # text on Python 3 — presumably this targets Python 2; confirm.
    with open('_alias.py', 'wb') as f:
        f.write(rendered)
    return VariablesLengthHelper(alias_map)
def parseMSOffice2011Plist(self, mru_file):
    """
    Parse an MS Office 2011 MRU plist and return the parsed aliases.

    Returns an empty list when the plist cannot be loaded; MRU sections
    that are absent or malformed are silently skipped, matching the
    original best-effort behavior.
    """
    plist = self.load_bplist(mru_file)
    if plist is None:
        return []
    aliases = []
    # Word, Excel and PowerPoint keep their MRU lists under separate keys;
    # one loop replaces three duplicated try-blocks.
    for mru_key in ("14\File MRU\MSWD", "14\File MRU\XCEL", "14\File MRU\PPT3"):
        try:
            for item in plist[mru_key]:
                aliases.append(Alias(data=item["File Alias"]).parse())
        except Exception:
            # Section missing or not shaped as expected; skip it.
            # (Narrowed from a bare except.)
            pass
    return aliases
def parseSidebarlistsPlist(self, mru_file):
    """
    Parse a sidebar lists plist and return parsed volume aliases from
    both the 'systemitems' and 'favorites' volume lists.

    Returns an empty list when the plist cannot be loaded; missing or
    malformed sections/entries are skipped.
    """
    plist = self.load_bplist(mru_file)
    if plist is None:
        return []
    aliases = []
    for section in ("systemitems", "favorites"):
        try:
            for item in plist[section]['VolumesList']:
                try:
                    # BUGFIX: the 'favorites' loop previously re-indexed
                    # plist["systemitems"] instead of its own section,
                    # so favorites aliases were never parsed correctly.
                    aliases.append(Alias(data=item['Alias']).parse())
                except Exception:
                    # Entry without a parsable Alias blob; skip it.
                    pass
        except Exception:
            # Section missing entirely; skip it.
            pass
    return aliases
def parseRecentItemsPlist(self, mru_file):
    """
    Parse a recent-items plist.

    Returns (bookmarks, aliases): bookmark blobs from the Recent*
    sections and alias blobs from the legacy sections.  Returns a bare
    [] when the plist itself cannot be loaded (kept as-is for backward
    compatibility with existing callers).
    """
    plist = self.load_bplist(mru_file)
    if plist is None:
        return []
    bookmarks = []
    aliases = []
    # Bookmark-based sections: one loop replaces three duplicated
    # try-blocks.
    for section in ("RecentApplications", "RecentDocuments", "RecentServers"):
        try:
            for item in plist[section]["CustomListItems"]:
                bookmarks.append(Bookmark(data=item["Bookmark"]).parse())
        except Exception:
            # Section missing or malformed; skip it. (Narrowed from bare except.)
            pass
    # Alias-based (legacy) sections.
    for section in ("Applications", "Documents", "Servers"):
        try:
            for item in plist[section]["CustomListItems"]:
                aliases.append(Alias(data=item["Alias"]).parse())
        except Exception:
            pass
    return bookmarks, aliases
class Profile(namedtuple('Profile', 'v e iv ie sim')):
    """Statistics profile tuple with per-relational-operator builders."""

    # Shared alias resolver and the relOp -> builder dispatch table.
    _aliases = Alias(set)
    _profiles = {
        'JdbcTableScan': tablescan,
        'JdbcProjectRel': projection,
        'JdbcFilterRel': selection,
        'JdbcJoinRel': join,
        'JdbcAggregateRel': aggregate,
        'JdbcToEnumerableConverter': jdbctoenumerate,
    }

    @classmethod
    def _antialias(cls, columns):
        """Expand every column through the alias map into a flat set."""
        expanded = flat(map(cls._aliases, columns))
        return set(expanded)

    @classmethod
    def build(cls, node, inputs):
        """Dispatch to the builder registered for this node's relOp."""
        builder = cls._profiles[node.get('relOp')]
        return builder(node, inputs)
def parseFinderPlist(self, mru_file):
    """
    Parse the Finder MRU plist's FXRecentFolders entries.

    Returns (bookmarks, aliases): each folder entry may carry a modern
    bookmark blob and/or a legacy alias blob, and either may be absent.
    Returns a bare [] when the plist cannot be loaded (kept as-is for
    backward compatibility with existing callers).
    """
    plist = self.load_bplist(mru_file)
    if plist is None:
        return []
    bookmarks = []
    aliases = []
    try:
        for item in plist["FXRecentFolders"]:
            try:
                bookmarks.append(Bookmark(data=item["file-bookmark"]).parse())
            except Exception:
                # No bookmark blob on this entry; fall through to alias.
                pass
            try:
                # Stray no-op `pass` before this append removed.
                aliases.append(
                    Alias(data=item["file-data"]["_CFURLAliasData"]).parse())
            except Exception:
                # No legacy alias blob on this entry.
                pass
    except Exception:
        # FXRecentFolders missing or not iterable. (Narrowed from bare except.)
        pass
    return bookmarks, aliases
uid = row[0] login = row[1].strip() name = row[2] user_type = row[7].strip() location = row[4] email = row[5] unmask[uid] = uid m = fakeusr_rex.search(login) if m is not None: record_type = USR_FAKE else: record_type = USR_REAL a = Alias(record_type, uid, login, name, email, location, user_type) aliases[uid] = a # - email d_uid_email[a.uid] = a.email if a.email is not None: d_email_uid.setdefault(a.email, set([a.uid])) d_email_uid[a.email].add(a.uid) # - prefix d_uid_prefix[a.uid] = a.email_prefix d_uid_comp_prefix[a.uid] = a.email_prefix if a.email_prefix is not None: if len(a.email_prefix.split('.')) > 1 or len( a.email_prefix.split('_')) > 1: d_comp_prefix_uid.setdefault(a.email_prefix, set([a.uid]))
def get_alias_map(conf):
    """Return a name -> alias mapping for every alias defined in *conf*."""
    return {entry.name: entry for entry in Alias.FromConfig(conf)}
def add_alias(self, left: str, right: str):
    """Create an alias joining the *right* and *left* nets, register it
    under "<left>_<right>" and return the new instance."""
    name = "{}_{}".format(left, right)
    alias_inst = Alias(name, self, self.get_net(right), self.get_net(left))
    self._instances[name] = alias_inst
    return alias_inst
class Compile:
    '''
    This class creates a compiler for the DDStorm that compiles the
    text files containing list of differential diagnosis to simplified
    modular data files usable by the program.
    '''

    def __init__(self, conf=False):
        '''
        The constructor optionally accepts a configuration.
        If none is provided it creates a default configuration.

        Parameters:
        conf - A dictionary containing configuration options
        '''
        if conf:
            self._conf = conf
        else:
            self._conf = Conf()
        self.clean = True

    def compile(self):
        ''' Compile the text files to DDStorm modules. '''
        self.source = set()
        self.custom = set()
        self.alias = Alias(self._conf)
        # Loop over library files and add *.txt files to source
        for path, subdirs, files in os.walk(self._conf.get("library_path")):
            for name in files:
                if fnmatch(name, "*.txt"):
                    self.source.add(os.path.join(path, name))
        # Loop over custom files and add *.txt files to custom
        for path, subdirs, files in os.walk(self._conf.get("custom_path")):
            for name in files:
                if fnmatch(name, "*.txt"):
                    self.custom.add(os.path.join(path, name))
        # Create module directory if not already present and delete all
        # module files
        if not os.path.isdir(self._conf.get("module_path")):
            os.makedirs(self._conf.get("module_path"))
        for f in os.listdir(self._conf.get("module_path")):
            if fnmatch(f, "*.module"):
                # os.path.join is safe whether or not module_path has a
                # trailing separator; plain concatenation was not.
                os.unlink(os.path.join(self._conf.get("module_path"), f))
        # Raw string fixes the invalid "\." / "\d" escapes flagged by
        # modern Python; the pattern (trailing ".<digits>" priority tag)
        # is unchanged.
        self.priorityRegex = re.compile(r"(?<=\.)\d+$")
        # First sort files by priority then compile them to module
        for src in self._sortPriority(self.source):
            self._makeModule(src)
        for src in self._sortPriority(self.custom):
            self._makeModule(src)

    def _sortPriority(self, files):
        ''' Sort data files based on their priority settings. '''
        ls = []
        # Loop over the files
        for addr in files:
            # Format the file name
            name = os.path.splitext(os.path.basename(addr))[0].lower().replace("_", " ").replace("-", " ")
            # Search for priority tag on file name
            m = re.search(self.priorityRegex, name)
            # Add to ls as (symptom name, priority number, file name)
            # with default priority of 100
            if m:
                ls.append((name.replace("." + m.group(), ""), int(m.group()), addr))
            else:
                ls.append((name, 100, addr))
        # Sort the file list, first by the symptom name, then by the
        # priority number
        ls.sort(reverse=True)
        if ls:
            return list(zip(*ls))[2]
        else:
            return ls

    def _makeModule(self, src):
        ''' Create application usable modules from data files. '''
        # Format the file name
        module = os.path.splitext(os.path.basename(src))[0].lower().replace("_", " ").replace("-", " ")
        # Remove the priority tag from file name
        m = re.search(self.priorityRegex, module)
        if m:
            module = module.replace("." + m.group(), "")
        # Create the module file name (join handles the separator)
        modFile = os.path.join(self._conf.get("module_path"), module + ".module")
        modFlag = False
        # Loop over both files, the source data file and the target module file
        with open(src, "r") as sf, open(modFile, "a") as tf:
            # Ignore lines starting with ! or #, + and - has special
            # meaning, write other lines to module. Log the errors.
            for line in sf:
                line = line.strip().split("#")[0]
                if len(line) == 0:
                    pass
                elif line.startswith("!"):
                    pass
                elif line.startswith("#"):
                    pass
                elif line.startswith("+"):
                    modFlag = True
                elif line.startswith("-"):
                    modFlag = True
                elif line.replace(" ", "").replace("-", "").replace("_", "").replace("'", "").isalnum():
                    print(self.alias.get(line).capitalize(), file=tf)
                else:
                    self.clean = False
                    logging.warning("Syntax error in file '" + src + "': " + line)
        # Deal with special lines
        if modFlag:
            modFlag = False
            with open(src, "r") as f:
                for line in f:
                    line = line.strip().split("#")[0]
                    if line[1:].replace(" ", "").replace("-", "").replace("_", "").replace("'", "").isalnum():
                        # If line starts with + add it to the module file
                        if line.startswith("+"):
                            with open(modFile, "r") as fn:
                                text = fn.read()
                            with open(modFile, "w") as fn:
                                print(self.alias.get(line[1:]).capitalize() + "\n" + text, file=fn)
                        # If line starts with - remove corresponding item
                        # from the module file
                        elif line.startswith("-"):
                            with open(modFile, "r") as fn:
                                text = fn.read()
                            text = text.replace(self.alias.get(line[1:]).capitalize() + "\n", "")
                            with open(modFile, "w") as fn:
                                print(text, file=fn)

    def is_clean(self):
        '''Report if compilation ended successfully'''
        return self.clean
def apply_aliasing(self, user, mailfrom, rcpttos, msg):
    """
    Apply an alias as the sender of a message, inferring one when none
    was given by the sender.  Handles cases 1d and 1e of the
    specification.

    Side effects: mutates `rcpttos` in place (removes the used alias),
    rewrites the message's To/From headers, sends the message, and
    records the exchange in the history table.  Returns False on
    inference failure, otherwise None.
    """
    usralias = None
    alias_addx = None
    # Predicate used to pick alias-shaped addresses out of the To header.
    is_alias_address = lambda entry: entry.parse_alias_address()
    logging.debug('Attempting to apply aliasing')
    # look for use of existing alias in To field (case 1d);
    for cur_addx in msg.search_header_addresses('to', is_alias_address):
        alias_pair = cur_addx.parse_alias_address()
        alias_data = self.db.get_alias_data(*alias_pair,
                                            uid=user.get_uid())
        if not alias_data:
            # Alias-shaped but unknown for this user; keep scanning.
            continue
        usralias = Alias(**alias_data)
        alias_addx = cur_addx
        #if not usralias.is_active():
        #    continue
        # remove alias from rcpttos and all To fields
        for i in range(len(rcpttos)):
            if alias_addx == rcpttos[i]:
                del rcpttos[i]
                break
        msg.replace_address('to', alias_addx, None)
        break
    # if no alias in To field, try to infer the correct one (case 1e);
    if not alias_addx:
        logging.debug("Couldn't find alias to use in headers; "
                      'attempting to infer correct alias')
        alias_data = self.db.infer_alias(user.get_uid(),
                                         msg.get_header_addresses('to'),
                                         user.get_salt())
        if not alias_data:
            # Cannot determine which alias to use: tell the user and bail.
            logging.debug('Failed to infer alias')
            err = ErrorMessage('applyalias.noinfer',
                               fromaddx=self.cfg.SVCALIAS,
                               toaddx=user.get_account_address(),
                               subject=msg['Subject'])
            self.send(err['From'], [user.get_forwarding_address()], err)
            return False
        usralias = Alias(**alias_data)
        #if not usralias.is_active():
        #    return False
        logging.debug('Succesfully inferred alias "%s"', str(usralias))
        alias_addx = Address(usralias.get_alias_address())
    # if we found an alias to use, apply it, send the
    # message, and record in history table;
    alias_addx.realname = Address(mailfrom).realname
    msg.replace_address('from', None, alias_addx)
    #del msg['message-id']
    if rcpttos == []:
        logging.info('No recipients left; ignoring');
        return
    # Split recipients: addresses on our own domain are forwarded through
    # the alias machinery, everything else is sent directly.
    rcpt_aliases = []
    rcpt_nonaliases = []
    for entry in rcpttos:
        rcpt_addx = Address(entry)
        if rcpt_addx.is_servername():
            rcpt_aliases.append(entry)
        else:
            rcpt_nonaliases.append(entry)
    self.send(str(alias_addx), rcpt_nonaliases, msg)
    self.forward(str(alias_addx), rcpt_aliases, msg)
    # NOTE(review): 'address' is presumably the email address-parsing
    # module imported under that name (getaddresses) — confirm at the
    # module's import block.
    self.db.add_history(usralias.get_rid(), True,
                        address.getaddresses(rcpttos),
                        msg['Message-ID'], user.get_salt())
    return
def forward(self, mailfrom, rcpttos, msg):
    """
    Handles Case 2, where email is not from a service user and so needs
    to be forwarded to various aliases.

    For each recipient: badly-formatted local addresses get an error
    reply; unknown/foreign addresses are ignored; known aliases are
    forwarded to the owning user's forwarding address, subject to the
    trust/captcha gate, and recorded in the history table.
    """
    for rcpt in rcpttos:
        prcpt = Address(rcpt)
        alias_pair = prcpt.parse_alias_address()
        logging.debug(rcpt)
        if not alias_pair:
            # if the domain is SERVERNAME, sender screwed up; return
            # error to sender...
            if prcpt.is_servername():
                logging.info('Encountered improperly formatted '
                             'address "%s" in recipients field',
                             prcpt.address)
                # Create error response message
                err = ErrorMessage('forward.badformat',
                                   fromaddx=self.cfg.SVCALIAS,
                                   toaddx=mailfrom,
                                   badalias=prcpt.address)
                self.send(err['From'], [mailfrom], err)
            # ... otherwise ignore; not our job to send to non-users
            logging.info('Encountered recipient outside our domain; ignoring')
        else:
            alias_data = self.db.get_alias_data(*alias_pair)
            if alias_data:
                fwd_alias = Alias(**alias_data)
                userdata = self.db.get_user(uid=fwd_alias.get_uid())
                assert userdata is not None
                user = User(**userdata)
                logging.debug('is trusted? %s', fwd_alias.is_trusted())
                # handle trustedness here; untrusted aliases gate the
                # sender through a captcha + user-approval flow.
                if not fwd_alias.is_trusted():
                    mfrom = Address(mailfrom)
                    # if sender is in trusted group, then it's all right;
                    if self.db.is_trusted_correspondent(mfrom,
                                                        user.get_salt(),
                                                        fwd_alias.get_rid(),
                                                        fwd_alias.get_trusted_timestamp()):
                        pass
                        # TODO: send/append something about newer alias
                        # to send to?
                    else:
                        capstat = self.db.get_capstat(mfrom,
                                                      user.get_salt(),
                                                      fwd_alias.get_rid())
                        logging.debug('capstat=%s', capstat)
                        if capstat < self.db.CAPTCHA_PENDING:
                            logging.debug('captcha not yet sent; trying to send one')
                            # If not approved, send captcha to sender and
                            # drop mail.
                            # TODO: Perhaps we can cache the mail somewhere.
                            cid = self.db.get_cid(mfrom, user.get_salt())
                            self.send_captcha(mailfrom, cid, fwd_alias)
                            #self.db.set_capstat(cid,
                            #                    fwd_alias.get_rid(),
                            #                    self.db.CAPTCHA_PENDING)
                            logging.debug('done sending captcha')
                        elif capstat == self.db.CAPTCHA_PENDING:
                            logging.debug('captcha was already sent; still waiting for solution')
                        elif capstat == self.db.CAPTCHA_APPROVED:
                            logging.debug('captcha approved, but not yet user approved')
                        # if user denied,
                        # TODO: just ignore? or do something more?
                        # pass
                        # if user judgement pending, send message
                        # informing them they must wait for user's approval?
                        if capstat == self.db.USER_PENDING:
                            pass  # TODO: send message
                        # Untrusted and not yet cleared: drop this message.
                        return
                # TODO: can consult a whitelist/blacklist/etc. here
                fwd_addx = Address(user.get_forwarding_address())
                fwd_addx.realname = prcpt.realname
                logging.info('Found alias for account (%s) Forwarding message to %s',
                             user.get_username(), fwd_addx.address)
                # Add hint as recipient name. The hint/domain is used as
                # a reminder to the user where this email address was
                # originally created for. But since we did not update
                # Reply-To, it will drop off when the user replies to the
                # message.
                rcptaddr = Address(rcpt)
                if rcptaddr.get_realname() == '':
                    if fwd_alias.get_hint() != None:
                        rcptaddr.set_realname(fwd_alias.get_hint())
                    elif fwd_alias.get_domain() != None:
                        rcptaddr.set_realname(fwd_alias.get_domain())
                msg.replace_address('To', rcpt, rcptaddr)
                acct_addx = Address(user.get_account_address())
                acct_addx.realname = prcpt.realname
                #del msg['message-id']
                #del msg['DKIM-Signature']
                if 'To' in msg:
                    msg.replace_header('To', msg['To'] + ', ' + str(acct_addx))
                if 'Reply-To' in msg:
                    msg.replace_header('Reply-To', msg['reply-to'] + ', ' + rcpt);
                else:
                    msg.add_header('Reply-To', mailfrom + ', ' + rcpt);
                if 'Message-ID' not in msg:
                    msg.generate_message_id(self.cfg.DOMAIN)
                self.send(mailfrom, [str(fwd_addx)], msg)
                self.db.add_history(fwd_alias.get_rid(), False,
                                    [Address(mailfrom)],
                                    msg['Message-ID'], user.get_salt())
            else:
                logging.info("Couldn't find data for alias (%s,%d)", *alias_pair)
    return
def create_alias_helper(self, user, aliasname,
                        primary=False, rcpt=None, trusted=True, hint=None):
    """
    Helper function to create alias.

    Generates <rand> for user for the <aliasname> specified.
    If <aliasname> belonging to the user already exists, the existing
    aid is used. If <aliasname> belonging to another user already
    exists, an error is returned.

    Returns the full alias Address on success, None on failure.
    """
    (aid, uid) = self.db.get_aliasname_data(aliasname)
    # Error if user doesn't own the aliasname
    if uid != None and uid != user.get_uid():
        logging.info('User %d does not own "%s".', user.get_uid(), aliasname)
        # Create error response message
        err = ErrorMessage('createalias.notowner',
                           fromaddx=self.cfg.GETALIAS,
                           toaddx=user.get_account_address(),
                           aliasname=aliasname)
        self.send(err['From'], [user.get_forwarding_address()], err)
        return None
    #
    # Now, aliasname either belongs to the user or is not in use.
    #
    # Gets the alias id, either by getting an existing one or create a new one.
    if uid == user.get_uid():
        newaid = aid
        logging.debug('Using existing aid %d for aliasname "%s"',
                      newaid, aliasname)
    elif uid == None:
        newaid = self.db.insert_alias(user.get_uid(), aliasname, primary)
        logging.debug('Created new aid %d for aliasname "%s"',
                      newaid, aliasname)
    else:
        return None
    #
    # If a recipient is given, check history to see if there was any
    # previously generated <rand> that we can use.
    # TODO: We might have to make sure the recipient is active.
    #
    newalias = None
    cid = None
    if rcpt != None:
        cid = self.db.peek_cid(rcpt, user.get_salt())
    rid = None
    if cid != None:
        rid = self.db.get_history_rid(aliasname, cid)
    if rid != None:
        # Found a history correspondence
        # BUGFIX: get_alias_data() returns a mapping of constructor
        # keyword arguments; it must be unpacked with ** exactly as the
        # other Alias(**...) call sites do, not passed positionally.
        hist_alias = Alias(**self.db.get_alias_data(rid))
        hist_aliasname, hist_aliasrand = hist_alias.get_alias_pair()
        logging.debug('History aliasname\t:"%s"', hist_aliasname)
        if hist_aliasname == aliasname:
            logging.debug('Reuse history aliasrand\t:"%s"', hist_aliasrand)
            newalias = Alias(hist_aliasname, hist_aliasrand)
        else:
            # Can't use the rid found since aliasname differs
            rid = None
    # Create a new alias (aka aliasrand or <aliasname>.<rand>)
    if newalias == None:
        logging.debug('Generating new aliasrand')
        newalias = Alias(aliasname, alias.generate_rint())
    logging.debug('Using alias\t\t: %s', newalias)
    # Update aid, uid and set isactive for new alias
    newalias.set_values(aid=newaid, uid=user.get_uid(), isactive=1)
    # Sets up alias pair
    alias_pair = newalias.get_alias_pair()
    # If we don't have rid yet, insert aliasrand to DB and mark as active
    if rid == None:
        rid = self.db.insert_aliasrnd(user.get_uid(),
                                      newaid,
                                      alias_pair[0], alias_pair[1],
                                      1, trusted, hint)
        if rid == None:
            return None
    # Looks like this double counts in the history table;
    #if rcpt != None:
    #    self.db.add_history(rid, True, [rcpt], user.get_salt())
    # Creates the alias address, which includes the domain
    aliasaddx = Address(newalias.get_alias_address())
    logging.info('Aliasrnd Address\t\t: %s', str(aliasaddx))
    return aliasaddx
def resolve_aliases(slug, inputs):
    """
    Cluster the user rows of *inputs* (a DataFrame-like object with
    id/name/email/login/type/location/record_type columns) into groups
    believed to be the same person, and return ``unmask``: a dict
    mapping every uid to the uid of its group representative (itself
    when unmerged).

    NOTE(review): relies on module-level names not visible here —
    Alias, USR_REAL/USR_FAKE, the clue constants (EMAIL, FULL_NAME,
    SIMPLE_NAME, NAME_PARTS, NAME_APPENDED, COMP_EMAIL_PREFIX,
    SIMPLE_EMAIL_PREFIX, PREFIX_LOGIN, PREFIX_NAME, LOGIN_NAME, DOMAIN,
    LOCATION), thresholds THR_MIN/THR_MAX, itertools.combinations /
    product and collections.Counter.  Confirm against the module.
    """
    print_flag = 0
    #out = open("merge_dump/"+slug.replace("/", "_____"), "w")
    unmask = {}
    aliases = {}
    # Helper structures: forward (uid -> attr) and inverted
    # (attr -> set of uids) indexes for each attribute.
    d_email_uid = {}
    d_uid_email = {}
    d_prefix_uid = {}
    d_uid_prefix = {}
    d_comp_prefix_uid = {}
    d_uid_comp_prefix = {}
    d_uid_domain = {}
    d_domain_uid = {}
    d_name_uid = {}
    d_uid_name = {}
    d_name_parts_uid = {}
    d_uid_name_parts = {}
    d_name_app_uid = {}
    d_uid_app_parts = {}
    d_login_uid = {}
    d_uid_login = {}
    d_location_uid = {}
    d_uid_location = {}
    d_uid_type = {}
    #d_type_usr = {}
    uid = 0
    # raw = {}
    # Pass 1: build an Alias record per row and index every attribute.
    for ind, row in inputs.iterrows():
        uid = row["id"]
        name = row["name"]
        email = row["email"]
        # raw[uid] = line
        login = row["login"]  #None #row[1].strip()
        if row["type"] == None:
            user_type = ""
        else:
            user_type = str(row["type"])  #None
        if row["location"] == None:
            location = ""
        else:
            location = str(row["location"])  #None
        # try:
        #     name = line.split('<')[0].strip()
        #     email = line.split('<')[1].strip().split('>')[0].strip()
        # except:
        #     print line
        #     exit()
        unmask[uid] = uid
        if row["record_type"] == 1:
            record_type = USR_REAL
        else:
            record_type = USR_FAKE
        # m = fakeusr_rex.search(login)
        # if m is not None:
        #     record_type = USR_FAKE
        # else:
        #     record_type = USR_REAL
        a = Alias(record_type, uid, login, name, email, location, user_type)
        aliases[uid] = a
        # - email
        d_uid_email[a.uid] = a.email
        if a.email is not None:
            d_email_uid.setdefault(a.email, set([a.uid]))
            d_email_uid[a.email].add(a.uid)
        # - prefix: "composite" prefixes contain '.' or '_' separators
        d_uid_prefix[a.uid] = a.email_prefix
        d_uid_comp_prefix[a.uid] = a.email_prefix
        if a.email_prefix is not None:
            if len(a.email_prefix.split('.')) > 1 or len(
                    a.email_prefix.split('_')) > 1:
                d_comp_prefix_uid.setdefault(a.email_prefix, set([a.uid]))
                d_comp_prefix_uid[a.email_prefix].add(a.uid)
            else:
                d_prefix_uid.setdefault(a.email_prefix, set([a.uid]))
                d_prefix_uid[a.email_prefix].add(a.uid)
        # - domain
        d_uid_domain[a.uid] = a.email_domain
        if a.email_domain is not None:
            d_domain_uid.setdefault(a.email_domain, set([a.uid]))
            d_domain_uid[a.email_domain].add(a.uid)
        # - login
        d_uid_login[a.uid] = a.login
        if a.login is not None:
            d_login_uid.setdefault(a.login, set([a.uid]))
            d_login_uid[a.login].add(a.uid)
            if a.record_type == USR_REAL:
                d_login_uid.setdefault(a.login.lower(), set([a.uid]))
                d_login_uid[a.login.lower()].add(a.uid)
        # type
        d_uid_type[a.uid] = a.usr_type
        # - name
        d_uid_name[a.uid] = a.name
        if a.name is not None and len(a.name):
            d_name_uid.setdefault(a.name, set([a.uid]))
            d_name_uid[a.name].add(a.uid)
            if len(a.name.split(' ')) == 1:
                d_name_uid.setdefault(a.name.lower(), set([a.uid]))
                d_name_uid[a.name.lower()].add(a.uid)
                # janejohnson -> janejohnson
                # we need this for matching
                d_name_app_uid.setdefault(a.name.lower(), set([a.uid]))
                d_name_app_uid[a.name.lower()].add(a.uid)
            # jane johnson -> janejohnson
            d_name_app_uid.setdefault("".join(a.name.split(" ")).lower(), set([a.uid]))
            d_name_app_uid["".join(a.name.split(" ")).lower()].add(a.uid)
            if "@" in a.name:
                # otherwise it will make "gmail", "com" as names
                name_subpart = a.name.split("@")[0]
                d_name_parts_uid.setdefault(name_subpart.lower(), set([a.uid]))
                d_name_parts_uid[name_subpart.lower()].add(a.uid)
            else:
                # xiyi ji -> ji xiyi
                name_parts_split = a.name.lower().replace(",", " ").replace(
                    ".", " ").split(' ')
                if len(name_parts_split) != 2:
                    continue
                new_name_parts = name_parts_split[-1] + " " + name_parts_split[0]
                d_name_parts_uid.setdefault(new_name_parts, set([a.uid]))
                d_name_parts_uid[new_name_parts].add(a.uid)
        # - location
        d_uid_location[a.uid] = a.location
        if a.location is not None and len(a.location):
            d_location_uid.setdefault(a.location, set([a.uid]))
            d_location_uid[a.location].add(a.uid)
        # idx += 1
        # if idx >= curidx:
        #     print curidx/step
        #     curidx += step
    # print 'Done: helpers'
    # Pass 2: turn shared attributes into per-pair "clues".
    clues = {}
    for email, set_uid in d_email_uid.items():
        if len(set_uid) > THR_MIN:
            for a, b in combinations(sorted(set_uid, key=lambda uid: int(uid)), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(EMAIL)
                # print a,b,EMAIL
    # print 'Done: email'
    for prefix, set_uid in d_comp_prefix_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            if len(prefix) >= 3:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: int(uid)), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(COMP_EMAIL_PREFIX)
                    # print a,b,COMP_EMAIL_PREFIX
    # print 'Done: comp email prefix'
    for prefix, set_uid in d_prefix_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            if len(prefix) >= 3:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: int(uid)), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(SIMPLE_EMAIL_PREFIX)
                    # print a,b,SIMPLE_EMAIL_PREFIX
    # print 'Done: email prefix'
    for prefix in set(d_prefix_uid.keys()).intersection(set(
            d_login_uid.keys())):
        if len(d_prefix_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_login_uid[prefix], key=lambda uid: int(uid)),
                    sorted(d_prefix_uid[prefix], key=lambda uid: int(uid))):
                if a < b:
                    clues.setdefault((a, b), [])
                    if not SIMPLE_EMAIL_PREFIX in clues[(a, b)]:
                        clues[(a, b)].append(PREFIX_LOGIN)
                        # print a,b,PREFIX_LOGIN
    # print 'Done: prefix=login'
    for prefix in set(d_prefix_uid.keys()).intersection(set(
            d_name_uid.keys())):
        if len(d_prefix_uid[prefix]) < THR_MAX and len(
                d_name_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_name_uid[prefix], key=lambda uid: int(uid)),
                    sorted(d_prefix_uid[prefix], key=lambda uid: int(uid))):
                if a < b:
                    clues.setdefault((a, b), [])
                    if not SIMPLE_EMAIL_PREFIX in clues[(a, b)]:
                        clues[(a, b)].append(PREFIX_NAME)
    # print 'Done: prefix=name'
    for prefix in set(d_login_uid.keys()).intersection(set(d_name_uid.keys())):
        if len(d_name_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_name_uid[prefix], key=lambda uid: int(uid)),
                    sorted(d_login_uid[prefix], key=lambda uid: int(uid))):
                if a < b:
                    clues.setdefault((a, b), [])
                    if not SIMPLE_EMAIL_PREFIX in clues[(a, b)]:
                        clues[(a, b)].append(LOGIN_NAME)
    # print 'Done: login=name'
    for name, set_uid in d_name_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            if len(name.split(' ')) > 1:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: int(uid)), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(FULL_NAME)
            else:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: int(uid)), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(SIMPLE_NAME)
    # print 'Done: full/simple name'
    for name, set_uid in d_name_parts_uid.items():
        #out.write(name + "," + str(set_uid) + "\n")
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            for a, b in combinations(sorted(set_uid, key=lambda uid: int(uid)), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(NAME_PARTS)
    #out.write("\n")
    # print 'Done: name parts'
    for name, set_uid in d_name_app_uid.items():
        #out.write(name + "," + str(set_uid))
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            for a, b in combinations(sorted(set_uid, key=lambda uid: int(uid)), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(NAME_APPENDED)
    # print 'Done: name parts appended'
    for domain, set_uid in d_domain_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            for a, b in combinations(sorted(set_uid, key=lambda uid: int(uid)), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(DOMAIN)
    # print 'Done: email domain'
    for location, set_uid in d_location_uid.items():
        if len(set_uid) > THR_MIN:
            for a, b in combinations(sorted(set_uid, key=lambda uid: int(uid)), 2):
                na = d_uid_name[a]
                nb = d_uid_name[b]
                # Location only counts when the pair also shares a
                # multi-word name that is not overly common.
                if na is not None and nb is not None and len(
                        na.split()) > 1 and na == nb:
                    if len(d_name_uid.get(na, set([]))) < THR_MAX:
                        clues.setdefault((a, b), [])
                        clues[(a, b)].append(LOCATION)
    # print 'Done: location'
    # Pass 3: union-find-style merging of clued pairs into clusters.
    # d_alias_map: uid -> cluster root; clusters: root -> member set;
    # labels: root -> list of rules that fired for the cluster.
    d_alias_map = {}
    clusters = {}
    labels = {}

    def merge(a, b, rule):
        # Contract: a < b
        assert a < b, "A must be less than B"
        if a in d_alias_map:
            if b in d_alias_map:
                if d_alias_map[a] == d_alias_map[b]:
                    labels[d_alias_map[a]].append(rule)
                else:
                    # Both already clustered: fold the higher-rooted
                    # cluster into the lower-rooted one.
                    lowest = min(d_alias_map[a], d_alias_map[b])
                    highest = max(d_alias_map[a], d_alias_map[b])
                    labels[lowest].extend(labels[highest])
                    labels[lowest].append(rule)
                    clusters[lowest].update(clusters[highest])
                    for x in clusters[highest]:
                        d_alias_map[x] = lowest
                    del labels[highest]
                    del clusters[highest]
                    d_alias_map[a] = lowest
                    d_alias_map[b] = lowest
            else:
                # a is an alias; first time I see b
                d_alias_map[b] = d_alias_map[a]
                clusters[d_alias_map[a]].add(b)
                labels[d_alias_map[a]].append(rule)
        else:
            if b in d_alias_map:
                #b_src = d_alias_map[b]
                # b_src < a by construction
                d_alias_map[a] = d_alias_map[b]
                clusters[d_alias_map[b]].add(a)
                labels[d_alias_map[b]].append(rule)
            else:
                # First time I see this pair (guaranteed sorted)
                d_alias_map[a] = a
                d_alias_map[b] = a
                clusters[a] = set([a, b])
                labels[a] = [rule]

    # Merge pairs in a deterministic order, strongest clue first.
    for (a, b), list_clues in sorted(clues.items(),
                                     key=lambda e: (int(e[0][0]), int(e[0][1]))):
        if print_flag:
            print(((a, b), list_clues))
        aa = aliases[a]
        ab = aliases[b]
        if EMAIL in list_clues:
            merge(a, b, EMAIL)
        elif len(set(list_clues)) >= 2:
            for clue in set(list_clues):
                merge(a, b, clue)
            # merge(a,b,TWO)
        elif FULL_NAME in list_clues:
            merge(a, b, FULL_NAME)
        elif NAME_APPENDED in list_clues:
            merge(a, b, NAME_APPENDED)
        elif NAME_PARTS in list_clues:
            merge(a, b, NAME_PARTS)
        elif COMP_EMAIL_PREFIX in list_clues:
            merge(a, b, COMP_EMAIL_PREFIX)
        elif SIMPLE_NAME in list_clues:
            merge(a, b, SIMPLE_NAME)
        elif PREFIX_NAME in list_clues:
            merge(a, b, PREFIX_NAME)
    # print 'Done: clusters'
    # Pass 4: validate each cluster against rule-count heuristics and
    # map members onto a single representative uid.
    for uid, member_uids in clusters.items():
        # print ((uid, member_uids))
        members = [aliases[m] for m in member_uids]
        # Count fake/real
        c = Counter([m.record_type for m in members])
        real = [m for m in members if m.record_type == USR_REAL]
        with_location = [m for m in real if m.location is not None]
        fake = [m for m in members if m.record_type == USR_FAKE]
        # Count rules that fired
        cl = Counter(labels[uid])
        if print_flag:
            print(cl)
        is_valid = False
        # If all have the same email there is no doubt
        if cl.get(EMAIL, 0) >= (len(members) - 1):
            is_valid = True
        # If all the REALs have the same email, assume all the FAKEs are
        # this REAL
        elif len(Counter([m.email for m in real]).keys()) == 1:
            is_valid = True
        # If there is at most one real, at least two rules fired, and
        # each rule applied to each pair
        elif len(cl.keys()) > 1 and min(cl.values()) >= (len(members) - 1):
            is_valid = True
        # At most one real, the only rule that fired is
        # COMP_EMAIL_PREFIX or FULL_NAME
        elif len(cl.keys()) == 1 and \
                (cl.get(COMP_EMAIL_PREFIX, 0) or cl.get(FULL_NAME, 0) or
                 cl.get(NAME_PARTS, 0) or cl.get(NAME_APPENDED, 0)):
            is_valid = True
        # All with same full name and location / same full name and
        # email domain
        elif cl.get(FULL_NAME, 0) >= (len(members) - 1) and \
                (cl.get(LOCATION, 0) >= (len(members) - 1) or
                 cl.get(DOMAIN, 0) >= (len(members) - 1)):
            is_valid = True
        # All same composite email prefix / same full name
        elif (cl.get(COMP_EMAIL_PREFIX, 0) >= (len(members) - 1) or
              cl.get(FULL_NAME, 0) >= (len(members) - 1)):
            is_valid = True
        elif cl.get(NAME_APPENDED, 0) >= (len(members) - 1):
            is_valid = True
        elif cl.get(FULL_NAME, 0) >= (len(members) - 1):
            is_valid = True
        # The only two rules that fired are full name and email, in
        # some combination
        elif len(cl.keys()) == 2 and cl.get(FULL_NAME, 0) > 0 and cl.get(
                EMAIL, 0) > 0:
            is_valid = True
        elif len(cl.keys()) == 3 and cl.get(FULL_NAME, 0) > 0 and cl.get(
                EMAIL, 0) > 0 and cl.get(SIMPLE_NAME, 0) > 0:
            is_valid = True
        elif len(cl.keys()) == 2 and cl.get(EMAIL, 0) > 0 and cl.get(
                SIMPLE_NAME, 0) > 0:
            is_valid = True
        elif cl.get(PREFIX_NAME, 0) > 0:
            is_valid = True
        elif cl.get(SIMPLE_NAME, 0) > 0 and cl.get(FULL_NAME, 0) > 0 \
                and cl.get(SIMPLE_EMAIL_PREFIX, 0) > 0 and cl.get(EMAIL, 0) > 0:
            is_valid = True
        elif cl.get(SIMPLE_NAME, 0) > 0:
            is_valid = True
        elif cl.get(NAME_PARTS, 0) >= (len(members) - 1):
            is_valid = True
        else:
            # is_valid = True
            # continue
            # Split by email address if at least 2 share one
            if cl.get(EMAIL, 0):
                ce = [
                    e for e, c in Counter([m.email for m in members]).items()
                    if c > 1
                ]
                for e in ce:
                    extra_members = [m for m in members if m.email == e]
                    extra_real = [
                        m for m in extra_members if m.record_type == USR_REAL
                    ]
                    extra_with_location = [
                        m for m in extra_real if m.location is not None
                    ]
                    if len(extra_real):
                        if len(extra_with_location):
                            # Pick the one with the oldest account with
                            # location, if available
                            rep = sorted(extra_with_location,
                                         key=lambda m: int(m.uid))[0]
                        else:
                            # Otherwise pick the one with the oldest account
                            rep = sorted(extra_real,
                                         key=lambda m: int(m.uid))[0]
                    else:
                        rep = sorted(extra_members,
                                     key=lambda m: int(m.uid))[0]
                    # w_log.writerow([])
                    # w_log.writerow([rep.uid, rep.login, rep.name, rep.email, rep.location])
                    for a in extra_members:
                        if a.uid != rep.uid:
                            # w_log.writerow([a.uid, a.login, a.name, a.email, a.location])
                            # writer.writerow([a.uid, rep.uid])
                            unmask[a.uid] = rep.uid
                            # print ('Mapped:' + str((a.uid, rep.uid)))
            # w_maybe.writerow([])
            # w_maybe.writerow([str(cl.items())])
            if print_flag:
                print(str(cl.items()))
                for m in members:
                    print([m.uid, m.name, m.email])
                    # w_maybe.writerow([m.uid, m.login, m.name, m.email, m.location])
        if is_valid:
            # Determine group representative
            if len(real):
                if len(with_location):
                    # Pick the one with the oldest account with
                    # location, if available
                    rep = sorted(with_location, key=lambda m: int(m.uid))[0]
                else:
                    # Otherwise pick the one with the oldest account
                    rep = sorted(real, key=lambda m: int(m.uid))[0]
            else:
                rep = sorted(members, key=lambda m: int(m.uid))[0]
            # w_log.writerow([])
            # w_log.writerow([str(cl.items())])
            # w_log.writerow([rep.uid, rep.login, rep.name, rep.email, rep.location])
            for a in members:
                if a.uid != rep.uid:
                    # w_log.writerow([a.uid, a.login, a.name, a.email, a.location])
                    # writer.writerow([a.uid, rep.uid])
                    unmask[a.uid] = rep.uid
                    if print_flag:
                        print('Mapped:' + str((a.uid, rep.uid)))
    return unmask
def main(args, app):
    """Create a new alias from the CLI arguments and persist it.

    Parameters:
        args - parsed CLI namespace; must provide .name and .command
        app  - application object exposing a .config with save()

    Side effects: stores the alias in app.config, saves the config, and
    prints a confirmation line.
    """
    new_alias = Alias(name=args.name, command=args.command)
    new_alias.store(app.config)
    app.config.save()
    # Was a Python 2 print statement ("print \"added\", ..."), which is a
    # SyntaxError under Python 3; the sibling commands already use print().
    print('added', new_alias.format_url())
def add_alias(self, new_cpp_type_name, old_cpp_type_name):
    """Register a KL alias for an existing C++ type.

    Creates the direct alias plus four derived aliases (const/mutable
    pointer and const/mutable reference), each with KL Make_* conversion
    helpers that forward to the aliased type's converters.

    Parameters:
        new_cpp_type_name - the new C++ type name to introduce
        old_cpp_type_name - the existing C++ type name being aliased

    Returns the direct Alias declaration, or an EmptyCommentContainer if
    anything failed (the failure is reported as an extension warning).
    """
    try:
        direct_new_cpp_global_expr = self.cpp_type_expr_parser.parse(
            new_cpp_type_name).prefix(self.components)
        direct_old_cpp_global_expr = self.resolve_cpp_type_expr(
            old_cpp_type_name)
        self.type_mgr.add_alias(direct_new_cpp_global_expr,
                                direct_old_cpp_global_expr)
        direct_new_kl_local_name = new_cpp_type_name
        direct_new_kl_global_name = '_'.join(
            self.nested_kl_names + [direct_new_kl_local_name])
        direct_old_dqti = self.type_mgr.get_dqti(direct_old_cpp_global_expr)
        # Debug output (converted from Python 2 print statements, which are
        # syntax errors under Python 3).
        print("direct_old_dqti.type_info.kl.name = " +
              str(direct_old_dqti.type_info.kl.name))
        print("direct_old_dqti.type_info.edk.name = " +
              str(direct_old_dqti.type_info.edk.name))
        print("direct_old_dqti.type_info.lib.name = " +
              str(direct_old_dqti.type_info.lib.name))
        print("direct_old_dqti.type_info.lib.expr = " +
              str(direct_old_dqti.type_info.lib.expr))
        direct_alias = Alias(self, direct_new_kl_global_name,
                             direct_old_dqti.type_info)
        self.ext.add_decl(direct_alias)

        def add_indirect_alias(wrap, kl_suffix):
            # Alias one derived (pointer/reference) form of the type and
            # emit the matching Make_* conversion helpers in KL.  This
            # replaces four byte-identical inline copies of the same logic.
            new_cpp_type_expr = wrap(direct_new_cpp_global_expr)
            old_cpp_type_expr = wrap(direct_old_cpp_global_expr)
            self.type_mgr.add_alias(new_cpp_type_expr, old_cpp_type_expr)
            new_kl_type_name = direct_new_kl_global_name + kl_suffix
            old_dqti = self.type_mgr.get_dqti(old_cpp_type_expr)
            old_kl_type_name = old_dqti.type_info.kl.name.compound
            self.ext.add_decl(Alias(self, new_kl_type_name,
                                    old_dqti.type_info))
            self.ext.add_kl_epilog("""
%s Make_%s(%s value) {
  return Make_%s(value);
}

%s Make_%s(io %s value) {
  return Make_%s(value);
}
""" % (
                new_kl_type_name,
                new_kl_type_name,
                direct_new_kl_global_name,
                old_kl_type_name,
                new_kl_type_name,
                new_kl_type_name,
                direct_new_kl_global_name,
                old_kl_type_name,
            ))

        add_indirect_alias(lambda e: PointerTo(Const(e)), "_CxxConstPtr")
        add_indirect_alias(PointerTo, "_CxxPtr")
        add_indirect_alias(lambda e: ReferenceTo(Const(e)), "_CxxConstRef")
        add_indirect_alias(ReferenceTo, "_CxxRef")
        return direct_alias
    except Exception as e:
        self.ext.warning("Ignoring alias '%s': %s" % (new_cpp_type_name, e))
        return EmptyCommentContainer()
def main(args, app):
    """Print every stored alias whose name matches args.name.

    A name of '*' matches all aliases; otherwise only the alias with the
    exact matching name is printed, formatted via args.format.
    """
    wanted = args.name
    for entry in Alias.FromConfig(app.config):
        # '*' acts as a wildcard; otherwise require an exact name match.
        if wanted == '*' or wanted == entry.name:
            print(args.format(entry))
def main(input_dir_path: str, out_dir_path: str):
    """Merge contributor identities (idm) from *contributors.csv files.

    Builds per-attribute lookup tables (email, email prefix, domain, login,
    name), derives pairwise merge "clues", merges pairs into clusters (via
    the module-level merge()/clusters/labels — TODO confirm they are reset
    between runs), validates each cluster, and writes:
      - idm_log.csv / idm_map.csv / idm_maybe.csv under <out_dir>/idm
      - a uid -> representative-uid dict pickled to idm/dict/aliasMap.dict

    Parameters:
        input_dir_path - folder scanned for "*contributors.csv" files
        out_dir_path   - base output folder (an "idm" subfolder is created)
    """
    log.info("Input dir: %s; out_dir: %s", input_dir_path, out_dir_path)
    # NOTE(review): abspath does not raise IndexError; this guard looks like
    # a leftover from sys.argv indexing. Kept for behavior compatibility.
    try:
        out_dir = os.path.abspath(out_dir_path)
    except IndexError:
        out_dir = os.path.abspath('./')
    out_dir = os.path.join(out_dir, 'idm')
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(os.path.join(out_dir, 'dict'), exist_ok=True)
    # Logins of exactly eight capital letters are treated as fake accounts.
    fakeusr_rex = regex.compile(r'\A[A-Z]{8}$')
    unmask = {}
    w_log = CsvWriter(csv_file=os.path.join(out_dir, 'idm_log.csv'))
    writer = CsvWriter(csv_file=os.path.join(out_dir, 'idm_map.csv'))
    w_maybe = CsvWriter(csv_file=os.path.join(out_dir, 'idm_maybe.csv'))
    idx = 0
    step = 100000
    curidx = step
    aliases = {}
    # Helper structures: forward (uid -> attr) and inverse (attr -> {uids}).
    d_email_uid = {}
    d_uid_email = {}
    d_prefix_uid = {}
    d_uid_prefix = {}
    d_comp_prefix_uid = {}
    d_uid_comp_prefix = {}
    d_uid_domain = {}
    d_domain_uid = {}
    d_name_uid = {}
    d_uid_name = {}
    d_login_uid = {}
    d_uid_login = {}
    df = utility.read_from_folder(input_dir_path, "*contributors.csv")
    users = [
        SzzContributor(getattr(row, "CONTRIBUTOR_ID"), getattr(row, "NAME"),
                       getattr(row, "EMAIL"))
        for row in df.itertuples(index=False)
    ]
    log.info("Users to parse: %d", len(users))
    for user in users:
        uid = user.id
        # NOTE(review): login deliberately copies name — SzzContributor has
        # no separate login field. Verify against SzzContributor definition.
        login = user.name
        name = user.name
        email = user.email
        # BUGFIX: was `name is "github" and email is "..."` — `is` compares
        # identity, not equality, so the skip almost never fired.
        if name == "github" and email == "*****@*****.**":
            continue
        unmask[uid] = uid
        m = fakeusr_rex.search(login)
        if m is not None:
            record_type = USR_FAKE
        else:
            record_type = USR_REAL
        a = Alias(record_type, uid, login, name, email)
        aliases[uid] = a
        # - email
        d_uid_email[a.uid] = a.email
        if a.email is not None:
            d_email_uid.setdefault(a.email, {a.uid})
            d_email_uid[a.email].add(a.uid)
        # - prefix (composite prefixes contain '.' or '_' separators)
        d_uid_prefix[a.uid] = a.email_prefix
        d_uid_comp_prefix[a.uid] = a.email_prefix
        if a.email_prefix is not None:
            if len(a.email_prefix.split('.')) > 1 or len(
                    a.email_prefix.split('_')) > 1:
                d_comp_prefix_uid.setdefault(a.email_prefix, {a.uid})
                d_comp_prefix_uid[a.email_prefix].add(a.uid)
            else:
                d_prefix_uid.setdefault(a.email_prefix, {a.uid})
                d_prefix_uid[a.email_prefix].add(a.uid)
        # - domain
        d_uid_domain[a.uid] = a.email_domain
        if a.email_domain is not None:
            d_domain_uid.setdefault(a.email_domain, {a.uid})
            d_domain_uid[a.email_domain].add(a.uid)
        # - login (real users are additionally indexed case-insensitively)
        d_uid_login[a.uid] = a.login
        if a.login is not None:
            d_login_uid.setdefault(a.login, set([a.uid]))
            d_login_uid[a.login].add(a.uid)
            if a.record_type == USR_REAL:
                d_login_uid.setdefault(a.login.lower(), set([a.uid]))
                d_login_uid[a.login.lower()].add(a.uid)
        # - name (single-word names are additionally indexed lowercased)
        d_uid_name[a.uid] = a.name
        if a.name is not None and len(a.name):
            d_name_uid.setdefault(a.name, {a.uid})
            d_name_uid[a.name].add(a.uid)
            if len(a.name.split(' ')) == 1:
                d_name_uid.setdefault(a.name.lower(), {a.uid})
                d_name_uid[a.name.lower()].add(a.uid)
        idx += 1
        if idx >= curidx:
            # BUGFIX: was log.info(curidx / step, '/ 30') — a float is not a
            # valid logging format string and raised at emit time.
            log.info("%d / 30", curidx // step)
            curidx += step
    log.info('Done: helpers')

    # Build pairwise merge clues; (a, b) keys are ordered pairs of uids.
    clues = {}
    for email, set_uid in d_email_uid.items():
        if len(set_uid) > THR_MIN:
            for a, b in combinations(sorted(set_uid, key=lambda uid: uid), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(EMAIL)
    log.info('Done: email')
    for prefix, set_uid in d_comp_prefix_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX:
            if len(prefix) >= 3:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: uid), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(COMP_EMAIL_PREFIX)
    log.info('Done: comp email prefix')
    for prefix, set_uid in d_prefix_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX:
            if len(prefix) >= 3:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: uid), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(SIMPLE_EMAIL_PREFIX)
    log.info('Done: email prefix')
    for prefix in set(d_prefix_uid.keys()).intersection(
            set(d_login_uid.keys())):
        if len(d_prefix_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_login_uid[prefix], key=lambda uid: uid),
                    sorted(d_prefix_uid[prefix], key=lambda uid: uid)):
                if a < b:
                    clues.setdefault((a, b), [])
                    # Avoid double-counting when the prefix rule already fired.
                    if SIMPLE_EMAIL_PREFIX not in clues[(a, b)]:
                        clues[(a, b)].append(PREFIX_LOGIN)
    log.info('Done: prefix=login')
    for prefix in set(d_prefix_uid.keys()).intersection(
            set(d_name_uid.keys())):
        if len(d_prefix_uid[prefix]) < THR_MAX and len(
                d_name_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_name_uid[prefix], key=lambda uid: uid),
                    sorted(d_prefix_uid[prefix], key=lambda uid: uid)):
                if a < b:
                    clues.setdefault((a, b), [])
                    if SIMPLE_EMAIL_PREFIX not in clues[(a, b)]:
                        clues[(a, b)].append(PREFIX_NAME)
    log.info('Done: prefix=name')
    for prefix in set(d_login_uid.keys()).intersection(
            set(d_name_uid.keys())):
        if len(d_name_uid[prefix]) < THR_MAX:
            for a, b in product(
                    sorted(d_name_uid[prefix], key=lambda uid: uid),
                    sorted(d_login_uid[prefix], key=lambda uid: uid)):
                if a < b:
                    clues.setdefault((a, b), [])
                    if SIMPLE_EMAIL_PREFIX not in clues[(a, b)]:
                        clues[(a, b)].append(LOGIN_NAME)
    log.info('Done: login=name')
    for name, set_uid in d_name_uid.items():
        if len(set_uid) > THR_MIN and len(set_uid) < THR_MAX:
            if len(name.split(' ')) > 1:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: uid), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(FULL_NAME)
            else:
                for a, b in combinations(
                        sorted(set_uid, key=lambda uid: uid), 2):
                    clues.setdefault((a, b), [])
                    clues[(a, b)].append(SIMPLE_NAME)
    log.info('Done: full/simple name')
    for domain, set_uid in d_domain_uid.items():
        if THR_MIN < len(set_uid) < THR_MAX:
            for a, b in combinations(sorted(set_uid, key=lambda uid: uid), 2):
                clues.setdefault((a, b), [])
                clues[(a, b)].append(DOMAIN)
    log.info('Done: email domain')

    # Merge pairs with strong enough evidence into clusters.
    for (a, b), list_clues in sorted(clues.items(),
                                     key=lambda e: (e[0][0], e[0][1])):
        if EMAIL in list_clues:
            merge(a, b, EMAIL)
        elif len(list_clues) >= 2:
            for clue in list_clues:
                merge(a, b, clue)
        elif FULL_NAME in list_clues:
            merge(a, b, FULL_NAME)
        elif COMP_EMAIL_PREFIX in list_clues:
            merge(a, b, COMP_EMAIL_PREFIX)
    log.info('Done: clusters')

    # Validate each cluster and pick a representative (smallest uid).
    for uid, member_uids in clusters.items():
        members = [aliases[m] for m in member_uids]
        real = [m for m in members if m.record_type == USR_REAL]
        # Count how often each rule fired inside the cluster.
        cl = Counter(labels[uid])
        is_valid = False
        # If all have the same email there is no doubt
        if cl.get(EMAIL, 0) >= (len(members) - 1):
            is_valid = True
        # If all the REALs share one email, assume the FAKEs are this REAL
        elif len(Counter([m.email for m in real]).keys()) == 1:
            is_valid = True
        # At most one real, >=2 rules fired, each rule applied to each pair
        elif len(real) <= 1 and len(cl.keys()) > 1 and min(
                cl.values()) >= (len(members) - 1):
            is_valid = True
        # At most one real, the only rule is COMP_EMAIL_PREFIX or FULL_NAME
        elif len(real) <= 1 and len(cl.keys()) == 1 and \
                (cl.get(COMP_EMAIL_PREFIX, 0) or cl.get(FULL_NAME, 0)):
            is_valid = True
        # Same full name + same location / same full name + same domain
        elif cl.get(FULL_NAME, 0) >= (len(members) - 1) and \
                (cl.get(LOCATION, 0) >= (len(members) - 1) or
                 cl.get(DOMAIN, 0) >= (len(members) - 1)):
            is_valid = True
        # All fake and same composite email prefix / same full name
        elif len(real) == 0 and \
                (cl.get(COMP_EMAIL_PREFIX, 0) >= (len(members) - 1) or
                 cl.get(FULL_NAME, 0) >= (len(members) - 1)):
            is_valid = True
        else:
            # Not confidently valid: split by email address if >=2 share one
            if cl.get(EMAIL, 0):
                ce = [
                    e for e, c in Counter(
                        [m.email for m in members]).items() if c > 1
                ]
                for e in ce:
                    extra_members = [m for m in members if m.email == e]
                    extra_rep = sorted(extra_members, key=lambda m: m.uid)[0]
                    w_log.writerow([])
                    w_log.writerow(
                        [extra_rep.uid, extra_rep.name, extra_rep.email])
                    for a in extra_members:
                        if a.uid != extra_rep.uid:
                            w_log.writerow([a.uid, a.name, a.email])
                            writer.writerow([a.uid, extra_rep.uid])
                            unmask[a.uid] = extra_rep.uid
            # Write also the "maybe" cluster to the alias map
            rep = sorted(members, key=lambda m: m.uid)[0]
            w_maybe.writerow([])
            w_maybe.writerow([str(cl.items())])
            for m in members:
                if m.uid != rep.uid:
                    unmask[m.uid] = rep.uid
                    writer.writerow([m.uid, rep.uid])
                w_maybe.writerow([m.uid, m.name, m.email])
        if is_valid:
            # Group representative: oldest (smallest) uid
            rep = sorted(members, key=lambda m: m.uid)[0]
            w_log.writerow([])
            w_log.writerow([str(cl.items())])
            w_log.writerow([rep.uid, rep.name, rep.email])
            for a in members:
                if a.uid != rep.uid:
                    w_log.writerow([a.uid, a.name, a.email])
                    writer.writerow([a.uid, rep.uid])
                    unmask[a.uid] = rep.uid
    log.info("Unmasked size: %d", len(unmask))
    # BUGFIX: the file handle passed to pickle.dump was never closed.
    with open(os.path.join(out_dir, 'dict', 'aliasMap.dict'), 'wb') as fh:
        pickle.dump(unmask, fh)
class Compile:
    '''
    This class creates a compiler for the DDStorm that compiles the
    text files containing list of differential diagnosis to simplified
    modular data files usable by the program.
    '''

    def __init__(self, conf=False):
        '''
        The constructor optionally accepts a configuration.
        If none is provided it creates a default configuration.

        Parameters:
        conf - A dictionary containing configuration options
        '''
        if (conf):
            self._conf = conf
        else:
            self._conf = Conf()
        # True until a syntax error is found in any data file
        self.clean = True

    def compile(self):
        ''' Compile the text files to DDStorm modules. '''
        self.source = set()
        self.custom = set()
        self.alias = Alias(self._conf)
        # Loop over library files and add *.txt files to source
        for path, subdirs, files in os.walk(self._conf.get("library_path")):
            for name in files:
                if (fnmatch(name, "*.txt")):
                    self.source.add(os.path.join(path, name))
        # Loop over custom files and add *.txt files to custom
        for path, subdirs, files in os.walk(self._conf.get("custom_path")):
            for name in files:
                if (fnmatch(name, "*.txt")):
                    self.custom.add(os.path.join(path, name))
        # Create module directory if not already present and delete all
        # module files
        if (not os.path.isdir(self._conf.get("module_path"))):
            os.makedirs(self._conf.get("module_path"))
        for f in os.listdir(self._conf.get("module_path")):
            if (fnmatch(f, "*.module")):
                os.unlink(self._conf.get("module_path") + f)
        # Create a regex for calculating priority from filename.
        # BUGFIX: raw string — "\." and "\d" are invalid escape sequences
        # in a plain string literal (SyntaxWarning on modern Python).
        self.priorityRegex = re.compile(r"(?<=\.)\d+$")
        # First sort files by priority then compile them to module
        for src in self._sortPriority(self.source):
            self._makeModule(src)
        for src in self._sortPriority(self.custom):
            self._makeModule(src)

    def _sortPriority(self, files):
        '''
        Sort data files based on their priority settings.

        Parameters:
        files - iterable of file paths

        Returns a tuple of file paths ordered by (symptom name,
        priority) descending, or an empty list if files is empty.
        '''
        ls = []
        # Loop over the files
        for addr in files:
            # Format the file name
            name = os.path.splitext(
                os.path.basename(addr))[0].lower().replace("_", " ").replace(
                    "-", " ")
            # Search for priority tag on file name
            m = re.search(self.priorityRegex, name)
            # Add to ls as (symptom name, priority number, file name) with
            # default priority of 100
            if (m):
                ls.append((name.replace("." + m.group(), ""), int(m.group()),
                           addr))
            else:
                ls.append((name, 100, addr))
        # Sort the file list, first by the symptom name, then by the
        # priority number
        ls.sort(reverse=True)
        if (ls):
            return (list(zip(*ls))[2])
        else:
            return ls

    def _makeModule(self, src):
        '''
        Create application usable modules from data files.

        Parameters:
        src - path of the source data file to compile
        '''
        # Format the file name
        module = os.path.splitext(
            os.path.basename(src))[0].lower().replace("_", " ").replace(
                "-", " ")
        # Remove the priority tag from file name
        m = re.search(self.priorityRegex, module)
        if (m):
            module = module.replace("." + m.group(), "")
        # Create the module file name
        modFile = self._conf.get("module_path") + module + ".module"
        modFlag = False
        # Loop over both files, the source data file and the target module
        # file
        with open(src, "r") as sf, open(modFile, "a") as tf:
            # Ignore lines starting with ! or #, + and - has special
            # meaning, write other lines to module. Log the errors.
            for line in sf:
                line = line.strip().split("#")[0]
                if (len(line) == 0):
                    pass
                elif (line.startswith("!")):
                    pass
                elif (line.startswith("#")):
                    pass
                elif (line.startswith("+")):
                    modFlag = True
                elif (line.startswith("-")):
                    modFlag = True
                elif (line.replace(" ", "").replace("-", "").replace(
                        "_", "").replace("'", "").isalnum()):
                    print(self.alias.get(line).capitalize(), file=tf)
                else:
                    self.clean = False
                    logging.warning("Syntax error in file '" + src + "': " +
                                    line)
        # Deal with special lines
        if (modFlag):
            modFlag = False
            with open(src, "r") as f:
                for line in f:
                    line = line.strip().split("#")[0]
                    if (line[1:].replace(" ", "").replace("-", "").replace(
                            "_", "").replace("'", "").isalnum()):
                        # If line starts with + add it to the module file
                        if (line.startswith("+")):
                            with open(modFile, "r") as fn:
                                text = fn.read()
                            with open(modFile, "w") as fn:
                                print(self.alias.get(line[1:]).capitalize() +
                                      "\n" + text,
                                      file=fn)
                        # If line starts with - remove corresponding item
                        # from the module file
                        elif (line.startswith("-")):
                            with open(modFile, "r") as fn:
                                text = fn.read()
                            text = text.replace(
                                self.alias.get(line[1:]).capitalize() + "\n",
                                "")
                            with open(modFile, "w") as fn:
                                print(text, file=fn)

    def is_clean(self):
        '''Report if compilation ended successfully'''
        return self.clean