def Match(self, text):
    # tokenize and normalize our text
    textArr = tokenize.wordpunct_tokenize(text.lower().strip())
    hits = 0
    results = []
    secondary = []
    # -tlength as we need to iterate over a window of words
    for ti in xrange(0, len(textArr) - self.tlength):
        for termT in self.toMatch:
            # what's the distance to our first token?
            dist1 = editdist.distance(textArr[ti], termT[hits])
            if dist1 <= self.thresh:
                if len(termT) <= 1:
                    print "got hit with %s" % termT
                    results.append(termT[hits])
                else:
                    dist2 = editdist.distance(textArr[ti + 1], termT[hits + 1])
                    print "distance between %s and %s is %s" % (textArr[ti + 1], termT[hits + 1], dist2)
                    # WARNING: this will only work for 2-grams where the tlength is an n-gram.
                    if dist2 <= self.thresh:
                        # we have a close hit; let's check if the second term in the tuple is a hit as well.
                        #hits = hits + 1
                        results.append("%s %s" % (termT[hits], termT[hits + 1]))
                        #print termT
                        #print "got hit on term %s"%results
                        # looks like we've found a match
    #print secondary
    # we're done
    return results
def distance(a, b):
    """find best edit distance between two strings of potentially uneven length. """
    la, lb = len(a), len(b)
    assert isinstance(a, basestring), isinstance(b, basestring)
    if la < lb:
        return distance(b, a)
    if la == lb:
        return ed.distance(a, b)
    else:
        dists = []
        for i in xrange(0, la - lb + 1):
            dists.append(ed.distance(a[i:i + lb], b))
        return min(dists)
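# A minimal usage sketch for the wrapper above (assumes py-editdist is installed
# and imported as `ed`, as in the example; the strings here are made up):
# distance("GATTACAGATT", "TACA") -> 0: the shorter string is slid along the
#     longer one, and the window "TACA" matches exactly, so the length
#     difference is never penalised.
# distance("TACA", "GATTACAGATT") -> 0 as well: the arguments are swapped by
#     the `if la < lb` branch before the windows are compared.
# A plain ed.distance("GATTACAGATT", "TACA") would report at least 7 edits.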
def filter_words(self, words):
    num = Settings.get('str_extra')
    what = Settings.get('str_what')
    if what == 'r':  # random
        pass
    else:
        control = self.get_list()
        if not control:
            return
        if what == 'e':  # encompassing
            stream = [(sum([x.count(c) for c in control]), x) for x in words]
            #print "str:", list(stream)[0:10]
            preres = list(itertools.islice(filter(lambda x: x[0] > 0, stream), 4*num))
            #print "pre:", preres
            preres.sort(key=lambda x: x[0], reverse=True)
            words = [x[1] for x in preres]
        else:  # similar
            words = filter(lambda x: 0 < min(
                editdist.distance(
                    x.encode('latin1', 'replace'),
                    y.encode('latin1', 'replace')) / max(len(y), len(x))
                for y in control) < .26, words)
    if Settings.get('str_clear') == 'r':  # replace = clear
        GtkUtil.textbuf_clear(self.buf())
    self.add_list(itertools.islice(words, num))
def filterWords(self, words):
    n = Settings.get('str_extra')
    w = Settings.get('str_what')
    if w == 'r':  # random
        pass
    else:
        control = self.getList()
        if len(control) == 0:
            return
        if w == 'e':  # encompassing
            stream = map(lambda x: (sum([x.count(c) for c in control]), x), words)
            print "str:", list(stream)[0:10]
            preres = list(islice(ifilter(lambda x: x[0] > 0, stream), 4*n))
            print "pre:", preres
            preres.sort(key=lambda x: x[0], reverse=True)
            words = map(lambda x: x[1], preres)
        else:  # similar
            words = ifilter(lambda x: 0 < min([
                editdist.distance(x.encode('latin1', 'replace'),
                                  y.encode('latin1', 'replace')) / max(len(y), len(x))
                for y in control]) < .26, words)
    if Settings.get('str_clear') == 'r':  # replace = clear
        self.clear()
    self.addList(islice(words, n))
def matchtitle(self, gtitle, mtitle):
    short_key = mtitle.lower()
    key_title = gtitle.lower()
    exactmatched = False
    if short_key == key_title:
        exactmatched = True
    if not exactmatched:
        # if the titles cannot be matched exactly, fall back to the Levenshtein distance
        x = ''
        y = ''
        for ch in short_key:
            if ch <= chr(127):
                x = x + ch
        for ch in key_title:
            if ch <= chr(127):
                y = y + ch
        short_key = x
        key_title = y
        ed = editdist.distance(short_key, key_title)
        print ed
        if ed < 15:  # adaptable
            exactmatched = True
    return exactmatched
def is_similar(query, targets, n):
    """Tests target set of sequences to the query.

    Args:
        query (str): query sequence
        targets (set): unique sequences
        n (int): allowable mismatches when comparing a query to a given sequence of the targets

    Returns:
        bool

    >>> import editdist
    >>> s = "ACTGA"
    >>> ts_1 = {"ACTGG"}
    >>> ts_2 = {"ACTCC", "ACTGG"}
    >>> ts_3 = {"ACTCC", "ACTTT"}
    >>> n = 1
    >>> is_similar(s, ts_1, n)
    True
    >>> is_similar(s, ts_2, n)
    True
    >>> is_similar(s, ts_3, n)
    False
    """
    if targets:
        for target in targets:
            if editdist.distance(target, query) <= n:
                return True
    return False
def get_from_ref_dict(L, v_in):
    assert False
    best_edit_dist = 1000000000000
    best_name = None
    for v_key, tv in L.iteritems():
        if v_key == v_in:
            return tv
        d = editdist.distance(v_in, v_key)
        min_l = min(len(v_in), len(v_key))
        if v_in[:min_l] == v_key[:min_l]:
            d = 0
        if d < best_edit_dist:
            best_edit_dist = d
            best_name = tv
        if d == 0:
            break
    print "%s >>>>>> %s" % (v_in, best_name)
    return best_name
def f(rec):
    ''' Filter function for cutsite'''
    cutsite = rec.seq[6: 6 + cutsite_length].tostring()
    cutsite_dist = ed.distance(target_cutsite, cutsite)
    return cutsite_dist <= mindist
def seq_matcher(name1, name2):
    name1 = unicode(
        unicodedata.normalize('NFKD', name1).encode('ascii', 'ignore'), 'utf-8')
    name2 = unicode(name2, 'utf-8')
    name2 = unicode(
        unicodedata.normalize('NFKD', name2).encode('ascii', 'ignore'), 'utf-8')
    soundex = fuzzy.Soundex(4)
    name1 = soundex(name1)
    name2 = soundex(name2)
    # dmeta = fuzzy.DMetaphone()
    # name1 = dmeta(name1)[0]
    # name2 = dmeta(name2)[0]
    # name1 = fuzzy.nysiis(name1)
    # name2 = fuzzy.nysiis(name2)
    m = SequenceMatcher(None, name1, name2)
    # Calculate an edit distance
    # print 'm', m.ratio()
    e = editdist.distance(name1, name2)
    # print 'e', e
    sm = StringMatcher(seq1=name1, seq2=name2)
    # return e
    # print sm.distance()
    return sm.distance()
def train(self, lemma2tokens={}):
    """
    Train the aligner by pairing each pair of tokens under a given lemma
    whose edit distance to each other is < the maximum alignment distance
    (i.e. identical tokens are also aligned, to not over-abstract).
    """
    # declare containers for the alignment vectors:
    self.train_feature_dicts = []
    self.train_labels = []
    for lemma in lemma2tokens:
        tokens = lemma2tokens[lemma]
        for t1 in tokens:
            for t2 in tokens:
                # combine and align items inside a lemma that are close enough (or identical!):
                if (t1 == t2) or (editdist.distance(t1, t2) < self.max_align_dist):
                    curr_dicts, curr_labels = self.transliterator.transliterate(t1, t2)
                    self.train_feature_dicts.extend(curr_dicts)
                    self.train_labels.extend(curr_labels)
    # transform the alignment vector dicts into sklearn format
    self.vectorizer = DictVectorizer()
    self.train_X = self.vectorizer.fit_transform(self.train_feature_dicts)
    self.labelEncoder = preprocessing.LabelEncoder()
    self.train_y = self.labelEncoder.fit_transform(self.train_labels)
    self.clf = SGDClassifier(loss="log", penalty="l2")
    print("Training the SGD classifier for the alignator...")
    self.clf.fit(self.train_X, self.train_y)
    return
def test_02__fuzz(self):
    for i in range(0, 32) + range(128, 1024, 128):
        for j in range(0, 32):
            a = randstring(i)
            b = randstring(j)
            dist = editdist.distance(a, b)
            self.assert_(dist >= 0)
def cal_distance(x1, x2, method='euclidean'):
    if method == 'euclidean':
        diff = x1 - x2
        sq_diff = diff * diff
        return np.sqrt(np.sum(sq_diff))
    elif method == 'editdist':
        return editdist.distance(x1, x2)
def cal_distance(x1, x2, method='euclidean'):
    if method == 'euclidean':
        diff = x1 - x2
        sq_diff = diff * diff
        return np.sqrt(np.sum(sq_diff))
    elif method == 'editdist':
        return editdist.distance(x1, x2)
def similarity_unordered(text1, text2):
    """Calculates the similarity between two short strings.

    This is done by sorting the tokens in the string and then calculating the
    Levenshtein edit distance via http://www.mindrot.org/projects/py-editdist/

    Returns a value between 0 and 1.

    >>> similarity_unordered('Ich bin müde', 'Ich bin müde')
    1.0
    >>> similarity_unordered('Ich bin müde', 'müde bin ich')
    1.0
    >>> similarity_unordered('Ich bin müde', 'Ich bin rüde')
    0.92307692307692313
    >>> similarity_unordered('Ich bin müde', 'Ich bin prüde')
    0.85714285714285721
    >>> similarity_unordered('Ich bin müde', 'Wenn dein starker Arm es will, stehen alle Räder still.')
    0.10909090909090913
    """
    if not editdist:
        # Module not installed, default to 1
        return 1.0
    tokens1 = ' '.join(sorted(tokenize_text(text1)))
    tokens2 = ' '.join(sorted(tokenize_text(text2)))
    maxlen = float(max([len(tokens1), len(tokens2)]))
    distance = editdist.distance(tokens1.encode('utf-8'), tokens2.encode('utf-8'))
    divisor = max([distance, maxlen])
    if not divisor:
        print repr(text1), repr(text1), maxlen, distance
        print repr(tokens1), repr(tokens2), maxlen, distance
        return 0
    return 1 - (distance / divisor)
def main(argv):
    words = set()
    for filename in argv[1:]:
        with open(filename) as f:
            for line in f:
                for word in line.strip().split(' '):
                    # TODO: remove puncts?
                    words.add(word)
    words = list(words)
    random.shuffle(words)
    pos = random.randint(0, len(words) - 1)
    chosen_word = words[pos]
    words = words[:pos] + words[pos+1:]
    sys.stdout.write(chosen_word + ' ')
    while words:
        best_dist = 10000000000
        best_pos = None
        for i in xrange(0, len(words)):
            word = words[i]
            dist = editdist.distance(chosen_word, word)
            if dist < best_dist:
                #print word, dist
                best_dist = dist
                best_pos = i
        pos = best_pos
        chosen_word = words[pos]
        words = words[:pos] + words[pos+1:]
        sys.stdout.write(chosen_word + ' ')
    sys.stdout.write('\n')
def matchtitle(self, gtitle, mtitle):
    short_key = mtitle.lower()
    key_title = gtitle.lower()
    exactmatched = False
    if short_key == key_title:
        exactmatched = True
    if not exactmatched:
        # if the titles cannot be matched exactly, fall back to the Levenshtein distance
        x = ''
        y = ''
        for ch in short_key:
            if ch <= chr(127):
                x = x + ch
        for ch in key_title:
            if ch <= chr(127):
                y = y + ch
        short_key = x
        key_title = y
        ed = editdist.distance(short_key, key_title)
        print ed
        if ed < 15:  # adaptable
            exactmatched = True
    return exactmatched
def apply_baseline(train_toks, train_lems, test_toks, test_lems):
    print("Calculating baseline")
    train_dict = {}
    for tok, lem in zip(train_toks, train_lems):
        if tok not in train_dict:
            train_dict[tok] = {}
        if lem not in train_dict[tok]:
            train_dict[tok][lem] = 0
        train_dict[tok][lem] += 1
    silver_lemmas = []
    for test_tok, test_lem in zip(test_toks, test_lems):
        # shortcut:
        if test_tok in train_dict:
            k = test_tok
        else:
            candidates = train_dict.keys()
            distances = [(editdist.distance(test_tok, c), c) for c in candidates]
            k = min(distances, key=itemgetter(0))[1]
        silver_lem = max(train_dict[k].iteritems(), key=itemgetter(1))[0]
        silver_lemmas.append(silver_lem)
    return silver_lemmas
def pickRecurse(self, track, info):
    if len(info) == 0:
        # if this is the last level in the trie, find the earliest year
        closest = min(self)
        return self[closest]
    else:
        # if not, find the key with the closest distance
        closest = min([(distance(x, info[0]), x) for x in self])[1]
        return self[closest].pickRecurse(track, info[1:])
def findBestResponse(self, input):
    bestDistance = maxint
    for query in self.convoDict:
        tempDist = distance(query, input)
        if tempDist < bestDistance and len(self.convoDict[query]) > 0:
            bestResponses = self.convoDict[query]
            bestDistance = tempDist
    return choice(bestResponses)
def trim_loc(a, b):
    """find best edit distance and return its index else return length of a"""
    la, lb = len(a), len(b)
    dists = []
    for i in xrange(0, la - lb + 1):
        dists.append(ed.distance(a[i:i + lb], b))
    best = min(dists)
    # 20% mismatch okay for now
    return dists.index(best) if best < .2 * lb else la
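# A hedged usage sketch for trim_loc above: locating an adapter-like fragment in
# a read so everything from that offset can be trimmed. The sequences are made
# up for illustration; the <=20% mismatch rule comes from the example itself.
read = "ACGTACGTAAGGTTCC"
adapter = "AAGGTTCC"
cut = trim_loc(read, adapter)   # -> 8, the offset where the adapter window matches exactly
trimmed = read[:cut]            # -> "ACGTACGT"
# An unrelated adapter exceeds the 20% mismatch cap in every window,
# so trim_loc returns len(read) and nothing is trimmed.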
def find_filename(string, match):
    editD = 1000
    fn = ''
    for f in match:
        dis = editdist.distance(string, match[f])
        if dis < editD:
            editD = dis
            fn = f
    return fn  # return the best-matching key, not the last one iterated
def calculate_majorseq(self, clusterids=None, table_prefix=None, seq_start_idx=6):
    """ Calculate the majority sequence observed in each cluster and what
    percentage of the cluster it makes up. Note that this can be different
    from the representative sequence.

    Also calculates a measure of self-similarity: the percentage of reads
    1 edit distance away, 2 edit distances away, etc.
    """
    if table_prefix is None:
        members_table_name = 'members'
        cluster_table_name = 'clusters'
    else:
        members_table_name = table_prefix + '_members'
        cluster_table_name = table_prefix + '_clusters'

    if clusterids is None:
        # Find last cluster id
        c = self.con.execute(''' SELECT COUNT(*) FROM {0}'''.format(cluster_table_name))
        clusterid_max = c.fetchone()['count(*)']
        clusterids = range(1, clusterid_max + 1)

    for cid in clusterids:
        cluster = self.get_cluster_by_id(cid, items=['seqid', 'seq'], table_prefix=table_prefix)

        # Fetch all unique seq data and find the most common
        cluster.get_unique_seq(seq_start_idx=seq_start_idx, db=self)
        majorSeq = cluster.unique_seqs.most_common()[0][0]

        if majorSeq != cluster.rep_seq[seq_start_idx:]:
            majorSeqIsRepSeq = False
        else:
            majorSeqIsRepSeq = True

        majorSeqPerc = (cluster.unique_seqs.most_common()[0][1] / float(cluster.size)) * 100

        # Calculate metric for self-similarity
        selfsimilarity = []
        # First work out lev distance between the top 5 unique seqs
        top5seqs = cluster.unique_seqs.most_common()[:5]
        # selfsimilarity = [(cumulative_percentage, edit distance), ... ()]
        for idx, (seq, count) in enumerate(top5seqs):
            if idx != 0:
                perc = (int((count / float(cluster.size)) * 100) * 100) / 100.0
                d = ed.distance(majorSeq, seq)
                selfsimilarity.append((perc, d))

        # Update info for cluster
        with self.con as con:
            sql_query = '''UPDATE {0} SET
                           majorSeq = ?, majorSeqIsRepSeq = ?, majorSeqPerc = ?, selfsimilarity = ?
                           WHERE clusterid = ?'''.format(cluster_table_name)
            con.execute(sql_query, (majorSeq, majorSeqIsRepSeq, majorSeqPerc, str(selfsimilarity), cid))
def fuzzy_wellbc_match(obs_wellbc, well_barcodes, start_pos, end_pos):
    '''
    This function takes a read and searches for supplied barcode sequences.

    Parameters
    ----------
    obs_wellbc: str, fastq sequence before the first instance of AD1 e.g. ATGCATG
    well_barcodes: list, expected well barcodes e.g. ATGCATG
    start_pos: int, limits the string search space of the obs_wellbc
    end_pos: int, limits the string search space of the obs_wellbc

    Returns
    -------
    The expected barcode found in the obs_wellbc OR 'mismatch' (if no barcode is found)
    '''
    assert type(start_pos) == int, 'start_pos should be an int'
    assert type(end_pos) == int, 'end_pos should be an int'

    # initializing obs_wellbc variable and set of expected well barcodes
    FASTQ, bc_set = obs_wellbc.upper(), set(well_barcodes)

    # limit search for exact matches to [:end_pos]
    matches = set(FASTQ[n:n + 8] for n in range(0, end_pos) if FASTQ[n:n + 8] in bc_set)
    #DEPRECATED
    #[matches.add(FASTQ[n:n+8]) for n in range(0, end_pos) if FASTQ[n:n+8] in bc_set]

    # RETURNS EXPECTED BARCODE SEQUENCE IF UNIQUE MATCH FOUND
    if len(matches) == 1:
        return (0, list(matches)[0])  # return the single best match

    # BRUTE FORCE SEARCH OF obs_wellbc subsequence
    else:
        matches = set()
        for bars in well_barcodes:
            # Differentiate between more than 1 exact match, or find fuzzy matches
            BARS = bars.upper()  # added a bit of ADAPTOR1 to make the mappings more stringent
            edist = editdist.distance(FASTQ, BARS)
            delta_8 = abs(len(FASTQ) - 8)  # correcting for difference in string seq lengths
            if edist < 2:
                return (edist, BARS)
            else:
                if edist + delta_8 < 2:
                    # looser thresholds performed poorly and only added maybe a couple hundred reads out of a million
                    matches.add((edist, bars))
        if len(matches) > 0:
            return (len(matches), ";".join([i[1] for i in matches]))
        return (8, "mismatch")
def f(rec):
    ''' Filter function for cutsite'''
    cutsite = rec.seq[6: 6 + cutsite_length].tostring()
    if cutsite.endswith(overhang):
        return True
    else:
        overhang_dist = ed.distance(cutsite[-overhang_length:], overhang)
        return overhang_dist <= mindist
def extract_translations(dict_csv_file): reader = csv.reader(open(dict_csv_file), dialect=csv.excel) headers = {} translations_by_worker = {} worker_stats = {} for i, header in enumerate(reader.next()): headers[header] = i for row in reader: workerID = row[headers['WorkerId']] status = row[headers['AssignmentStatus']] if status == 'Approved': for i in range(1, 13): word = row[headers['Input.word_' + str(i)]].decode('utf8') translation = row[headers['Answer.translation_' + str(i) + '_1']].decode('utf8') if not word in translations_by_worker: translations_by_worker[word] = {} translations_by_worker[word][workerID] = translation if (i <= 2): gold = row[headers['Input.translation_' + str(i)]].decode('utf8') try: edit_distance = float( editdist.distance(gold.lower(), translation.lower())) / len(gold) except: edit_distance = 1 if not workerID in worker_stats: worker_stats[workerID] = {} worker_stats[workerID]['num_translations'] = 0 worker_stats[workerID]['total_edit_distance'] = 0 worker_stats[workerID]['num_translations'] += 1 worker_stats[workerID][ 'total_edit_distance'] += edit_distance # calculate the performance of each worker for workerID in worker_stats: num_translations = worker_stats[workerID]['num_translations'] total_edit_distance = worker_stats[workerID]['total_edit_distance'] avg_edit_distance = total_edit_distance / num_translations worker_stats[workerID]['avg_edit_distance'] = avg_edit_distance # extract the best translations best_translations = {} for word in translations_by_worker: best_translation = '' best_edit_distance = 1000 for workerID in translations_by_worker[word]: if worker_stats[workerID][ 'avg_edit_distance'] <= best_edit_distance: best_translation = translations_by_worker[word][workerID] best_edit_distance = worker_stats[workerID][ 'avg_edit_distance'] if best_translation != '': best_translation = best_translation.replace(' ', '_') best_translations[word] = best_translation print len(best_translations.keys()) return best_translations
def matchAuthors_strict_v1(self, google_author_string, authors, debug_output=False): '''If the two author string matched, return True @return: boolean @param: - google_author_string, e.g. … DeSmedt, W Du, W Kent, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org - authors, e.g. Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan ''' ignore_sign = '…' # ignore_sign = '…' # process google part mark = google_author_string.find(' - ') if mark != -1: google_author_string = google_author_string[:google_author_string.find(' - ')] google_author_string = re.sub("(<(.*?)>)", "", google_author_string) google_author_string = re.sub("[^A-Za-z0-9,\s%s]" % ignore_sign, "", google_author_string) google_author_string = re.sub("\\s+", " ", google_author_string) google_author_string = google_author_string.strip() ignore_left = google_author_string.startswith(ignore_sign) ignore_right = google_author_string.endswith(ignore_sign) compact_google_str = self.__trans_to_compact(google_author_string, ignore_sign) # print '--- ', compact_google_str # process author part compact_authors = self.__trans_to_compact(authors, ignore_sign); # print ',,, ', compact_authors # compare cmp_gc = '' cmp_db = '' if ignore_left and not ignore_right: # and compact_authors.endswith(compact_google_str): cmp_gc = compact_google_str cmp_db = compact_authors[-len(compact_google_str):] elif not ignore_left and ignore_right: # and compact_authors.startswith(compact_google_str): cmp_gc = compact_google_str cmp_db = compact_authors[:len(compact_google_str)] elif ignore_left and ignore_right and compact_authors.find(compact_google_str) != -1: return True # todo elif not ignore_left and not ignore_right: # and compact_authors == compact_google_str: cmp_gc = compact_google_str cmp_db = compact_authors else: return False edd = editdist.distance(cmp_gc, cmp_db) if edd > 0: if debug_output: print '[ERR] editdist for "%s" and "%s" is %s' % (cmp_gc, cmp_db, edd) if edd <= 2: return True
def main(argv):
    filenames = argv[1:]
    words = []
    for filename in tqdm(filenames):
        with open(filename, 'r') as f:
            for line in f:
                bits = line.strip().split()
                for bit in bits:
                    words.extend(bit.split('--'))
    sentences = []
    sentence = []
    for word in tqdm(words):
        if word.startswith(('"', "'")):
            word = word[1:]
        if word.endswith(('"', "'")):
            word = word[:-1]
        sentence.append(word)
        if word not in ('Mr.', 'Mrs.', 'Dr.') and word.endswith(('.', '!', '?')):
            sentences.append(sentence)
            sentence = []
    sentences.append(sentence)
    for sentence in sentences:
        distances = {}  # frozenset of two (word, pos) tuples -> distance
        for (pos1, word1) in enumerate(sentence):
            for (pos2, word2) in enumerate(sentence):
                if word1 == word2:
                    continue
                if MIN_LENGTH:
                    if len(word1) < MIN_LENGTH or len(word2) < MIN_LENGTH:
                        continue
                dist = editdist.distance(word1, word2)
                pair = frozenset([(word1, pos1), (word2, pos2)])
                if pair in distances:
                    assert distances[pair] == dist
                distances[pair] = dist
        smallest_distance = 100000000
        smallest_pair = None
        for pair, distance in distances.iteritems():
            if distance < smallest_distance:
                smallest_distance = distance
                smallest_pair = pair
        if smallest_pair is not None:
            smallest_pair = list(smallest_pair)
            (word1, pos1) = smallest_pair[0]
            (word2, pos2) = smallest_pair[1]
            sentence[pos2] = word1
            sentence[pos1] = word2
        print ' '.join(sentence)
def same(d1, d2, value_of_same=0.1):
    #dis = editdist.distance(d1.upper(), d2.upper().encode('utf-8'))
    # editdist can accept two str or two unicode, but if they are mismatched it
    # will convert using ascii, which doesn't work
    dis = editdist.distance(unicode_to_str(remove_uneeded(d1.upper())),
                            unicode_to_str(remove_uneeded(d2.upper())))
    if len(d2) > len(d1):
        longest_len = len(d2)
    else:
        longest_len = len(d1)
    levensthein_to_len = (1.0 * dis) / longest_len
    return levensthein_to_len < value_of_same
def tokens_are_similar(t1, t2):
    similarity = 1 - 1.0 * editdist.distance(t1, t2) / max(len(t1), len(t2))
    if similarity == 1 or (len(t1) > 3 and similarity > 0.5):
        return True
    # dist = embeddings_dist(t1, t2)
    # if dist < 0.001:
    #     print 'embeddings think those are similar: {0}, {1}'.format(t1, t2)
    #     return True
    return False
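# A minimal check of the normalized-similarity rule above (assumes py-editdist
# is importable as `editdist`; the tokens are invented for illustration):
import editdist

def _similarity(t1, t2):
    # same expression as in tokens_are_similar above
    return 1 - 1.0 * editdist.distance(t1, t2) / max(len(t1), len(t2))

# "colour" vs "color": distance 1 over length 6 -> similarity ~0.83, accepted
# because both tokens are longer than 3 characters and 0.83 > 0.5.
# "cat" vs "car": similarity ~0.67, but rejected by tokens_are_similar since
# len("cat") is not > 3 and the strings are not identical.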
def cutsite_filter(rec):
    """ Filter function for cutsite """
    cutsite = rec.seq[midtag_length: midtag_length + cutsite_length].tostring()
    for target_site in target_cutsites:
        cutsite_dist = editdist.distance(target_site, cutsite)
        if cutsite_dist <= max_edit_dist:
            return True
    return False
def fuzzy_wellbc_match(obs_wellbc, well_barcodes, start_pos, end_pos): ''' This function takes a read and searches for supplied barcode sequences. Parameters ---------- obs_wellbc: str, fastq sequence before the first instance of AD1 e.g. ATGCATG well_barcodes: list, expected well barcodes e.g. ATGCATG start_pos: int, limits the string search space of the obs_wellbc end_position: int, limits the string search space of the obs_wellbc Returns ------- The expected barcode found in the obs_wellbc OR 'mismatch' (if no barcode is found) ''' assert type(start_pos) == int, 'start_pos should be an int' assert type(end_pos) == int, 'end_pos should be an int' #initializing obs_wellbc variable and set of expected well barcodes FASTQ, bc_set = obs_wellbc.upper(), set(well_barcodes) #limit search for exact matches to [: end_pos] matches = set(FASTQ[n:n+8] for n in range(0, end_pos) if FASTQ[n:n+8] in bc_set) #DEPRECATED #[matches.add(FASTQ[n:n+8]) for n in range(0, end_pos) if FASTQ[n:n+8] in bc_set] #RETURNS EXPECTED BARCODE SEQUENCE IF UNIQUE MATCH FOUND if len(matches) == 1: return (0, list(matches)[0]) #return the single best match #BRUTE FORCE SEARCH OF obs_wellbc subsequence else: matches = set() for bars in well_barcodes: #Differentiate between more than 1 exact match, or find fuzzy matches BARS = bars.upper() #added a bit of ADAPTOR1 to make the mappings more stringent edist = editdist.distance(FASTQ, BARS) delta_8 = abs(len(FASTQ) - 8) #correcting for difference in string seq lengths if edist < 2: return (edist, BARS) else: if edist + delta_8 < 2: #looser thresholds performed poorly and only added maybe a couple hundred reads out of a million matches.add((edist, bars)) if len(matches) >0: return (len(matches), ";".join([i[1] for i in matches]) ) return (8, "mismatch")
def overhang_filter(rec):
    ''' Filter function for cutsite'''
    cutsite = rec.seq[midtag_length: midtag_length + cutsite_length].tostring()
    for i, pat in enumerate(overhang_patterns):
        dist = editdist.distance(target_cutsites[i], cutsite)
        if dist <= max_edit_dist:
            if cutsite.endswith(pat):
                return True
    return False
def output_word(word, words):
    best_x = None
    best_dist = 1000000000
    for x, candidate in enumerate(words):
        dist = editdist.distance(word, candidate)
        if dist < best_dist:
            best_dist = dist
            best_x = x
            if best_dist == 0:
                break
    chosen = words.pop(best_x)
    sys.stdout.write(chosen + ' ')
    sys.stdout.flush()  # 'cos it's a bit pokey :)
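# A small sketch of how output_word above consumes its candidate list (the word
# list and query are made up; `sys` and `editdist` are assumed to be imported,
# as in the example itself):
words = ["cart", "apple", "card"]
output_word("care", words)   # writes "cart " (first candidate at distance 1) and pops it
output_word("care", words)   # writes "card " (closest remaining candidate) and pops it
# words is now left as ["apple"]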
def surname_compatibility(sa, sb):
    name_comparison_print('|-- Comparing surnames: %s %s' % (sa, sb))
    MAX_ALLOWED_SURNAME_DISTANCE_PERCENT = 0.33
    sa = clean_name_string(sa, replacement='', keep_whitespace=False, trim_whitespaces=True)
    sb = clean_name_string(sb, replacement='', keep_whitespace=False, trim_whitespaces=True)
    dist = distance(sa, sb)
    ml = float(max(len(sa), len(sb)))
    name_comparison_print('|--- dist:%s, ml:%s' % (dist, ml))
    if ml == 0 or dist / ml > MAX_ALLOWED_SURNAME_DISTANCE_PERCENT:
        return 0.0
    else:
        return 1. - float(dist) / max(len(sa), len(sb))
def test_levenshtein(class_data, query):
    all_levenshtein = []
    for i, class_datum in enumerate(class_data):
        class_title = class_datum.get('full_title')
        all_levenshtein.append((i, editdist.distance(query, class_title.encode('utf-8'))))
    all_levenshtein_sorted = sorted(all_levenshtein, key=lambda x: x[1])
    top_lev = []
    for i, tup in enumerate(all_levenshtein_sorted):
        pos, lev = tup
        print "{0}: '{1}' with levenshtein distance of {2}".format(i, class_data[pos].get('full_title'), lev)
        top_lev.append(class_data[pos].get('title'))
        if i == 9:
            break
def _min_names_screwup_list(nalo, nalt):
    nalo = list(nalo)
    nalt = list(nalt)
    sl = []
    for n in nalo:
        maxs = max(len(n), max((len(k) for k in nalt)))
        all_scr = [distance(n, k) for k in nalt]
        mins = min(all_scr)
        sl.append((mins, maxs))
        nalt.pop(all_scr.index(mins))
        if len(nalt) < 1:
            break
    return sl
def _min_names_screwup_list(nalo, nalt):
    nalo = list(nalo)
    nalt = list(nalt)
    sl = []
    for n in nalo:
        maxs = max(len(n), max((len(k) for k in nalt)))
        all_scr = [distance(n, k) for k in nalt]
        mins = min(all_scr)
        sl.append((mins, maxs))
        nalt.pop(all_scr.index(mins))
        if len(nalt) < 1:
            break
    return sl
def get_primer(query, primers, n):
    """return the primer name and the length to trim."""
    primer = ""
    distance = n + 1
    for name, target in primers.iteritems():
        d = ed.distance(query[:len(target)], target)
        if d < distance:
            distance = d
            primer = name
    if distance < n:
        return primer, len(primers[primer])
    else:
        return False, False
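# A hedged usage sketch for get_primer above (the primer names and sequences are
# invented; `ed` is assumed to be the py-editdist module, as in the example):
primers = {"primer_A": "ACGTAC", "primer_B": "TTGGCC"}
name, trim_len = get_primer("ACGTACGGGTTT", primers, 2)
# -> ("primer_A", 6): the first 6 bases match "ACGTAC" with 0 mismatches,
#    below the allowed threshold of n=2.
name, trim_len = get_primer("GGGGGGGGGGGG", primers, 2)
# -> (False, False): no primer prefix is within 2 edits, so nothing is trimmed.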
def get_levenshtein_candidates(self, test_token="token", token2lemmas={}):
    """
    Function returns an initial, rough selection of Levenshtein candidates
    """
    candidates = []
    for train_token in token2lemmas:
        train_lemmas = token2lemmas[train_token]
        # calculate the edit distance between the test_token and all seen tokens
        edit_dist = editdist.distance(test_token, train_token)
        # append the training item as a candidate if it is close enough:
        if edit_dist <= self.max_lev_dist:
            candidates.append([train_token, train_lemmas, edit_dist])
    if candidates:
        return candidates
def cutsite_filter(rec):
    ''' Filter function for cutsite '''
    fname = self.current_file
    if cutsite_filter.target_file is None or cutsite_filter.target_file != fname:
        cutsite_filter.target_file = fname
        tags = self.get_data4file(fname, fields=['MIDtag'])
        cutsite_filter.MIDlength = len(tags[0][0])
    cutsite = rec.seq[cutsite_filter.MIDlength: cutsite_filter.MIDlength + cutsite_length].tostring()
    cutsite_dist = ed.distance(target_cutsite, cutsite)
    return cutsite_dist <= max_edit_dist
def compareNamesLc(name, myText):
    myTextOrig = myText
    nameOrig = name
    nameIsAcronym = 0
    myTextOrigList = myText.split()
    if name.isupper():
        nameIsAcronym = 1
    else:
        myText = string.lower(myText).strip()
        name = string.lower(name).strip()
    nameList = name.split()
    lengthName = len(nameList)
    myTextList = myText.split()
    lengthText = len(myTextList)
    resultHits = []
    for t in range(0, lengthText - lengthName + 1):
        testName = " ".join(myTextList[t:t + lengthName])
        if testName == name:
            # exact match
            if nameIsAcronym:
                testNameOrig = " ".join(myTextOrigList[t:t + lengthName])
                if testNameOrig.isupper():
                    # match only with acronyms!
                    resultHits.append([t, t + lengthName])
            else:
                resultHits.append([t, t + lengthName])
        else:
            # fuzzy match
            charactersName = list(name)
            charactersMyText = list(testName)
            maxDistance = max(1, len(charactersName) / 7)  # just some heuristic
            if len(charactersName) < 2 or abs(
                    len(charactersName) - len(charactersMyText)) > maxDistance:
                continue
            distanceNames = editdist.distance(name, testName)
            if distanceNames > maxDistance:
                continue
            else:
                if nameIsAcronym:
                    # only exact matches for acronyms
                    testNameOrig = " ".join(myTextOrigList[t:t + lengthName])
                    if testNameOrig.isupper():
                        # match only with acronyms!
                        logger.info("found similar but not equal names: " + name + " - " + testNameOrig)
                        resultHits.append([t, t + lengthName])
                else:
                    logger.info("found similar but not equal names: " + name + " - " + testName)
                    resultHits.append([t, t + lengthName])
    return resultHits
def match_index(t, idx_d, idx_len=None, mismatch_allowed=1):
    '''given an index read sequence and a dictionary of the form
    {"<read_sequence>": "<index_number>", ...}
    returns <index_number> if the best match is the only index within mismatch_allowed
    '''
    # removed in variable length setup
    #if idx_len is None:
    #    idx_len = list(set([len(k) for k in idx_d]))[0]
    tagdist = sorted([(distance(t_this, t[:len(t_this)]), t_this) for t_this in idx_d.keys()])
    if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed:
        return idx_d[tagdist[0][1]]
    else:
        return None
def match_index(t, idx_d, idx_len=None, mismatch_allowed=1):
    '''given an index read sequence and a dictionary of the form
    {"<read_sequence>": "<index_number>", ...}
    returns <index_number> if the best match is the only index within mismatch_allowed

    DOES NOT check to make sure all indices are idx_len
    (even if left to get idx_len, i.e. no idx_len supplied)
    '''
    if idx_len is None:
        idx_len = list(set([len(k) for k in idx_d]))[0]
    tagdist = sorted([(distance(t_this, t[:idx_len]), t_this) for t_this in idx_d.keys()])
    if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed:
        return idx_d[tagdist[0][1]]
    else:
        return None
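# A hedged usage sketch for match_index above (barcode sequences and sample ids
# are invented; `distance` is assumed to be editdist.distance, as in the example):
idx_d = {"ACGTAC": "sample_1", "TTGGCC": "sample_2"}
match_index("ACGTAGNNNN", idx_d)   # -> "sample_1": one mismatch to ACGTAC, and
                                   #    TTGGCC is more than 1 edit away
match_index("GGGGGGAAAA", idx_d)   # -> None: no barcode is within 1 edit of the
                                   #    read prefix, so the assignment is rejected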
def compare_strings(str1, str2):
    """Compares 2 strings with the Levenshtein distance and returns a normalized
    value between 0.0 and 1.0 (meaning totally different and exactly the same,
    respectively)."""
    if is_editdist_loaded:
        if str1 == str2:
            return 1.0
        max_len = max(len(str1), len(str2))
        if max_len == 0:
            return 0.0
        distance = editdist.distance(str1, str2)
        return (max_len - distance) / float(max_len)
    else:
        # the edit distance module is not loadable, we have to fail the comparison;
        # all the strings will be treated as completely different
        return 0.0
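# Quick illustration of the normalization in compare_strings above (values assume
# the editdist module is importable, i.e. is_editdist_loaded is True; the strings
# are arbitrary):
# compare_strings("kitten", "sitten") -> 5/6 ~= 0.833  (1 edit over max length 6)
# compare_strings("kitten", "kitten") -> 1.0           (early return for equal strings)
# compare_strings("abc", "")          -> 0.0           (3 edits over max length 3)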
def clean_list(L, sub_dict):
    i = 0
    while i + 1 < len(L):
        s1 = L[i]
        s2 = L[i + 1]
        if s1 == s2[:len(s1)]:
            suffix = s2[len(s1):]
            if suffix.strip().startswith(':'):
                sub_dict[L[i + 1].strip().lower()] = L[i]
                del L[i + 1]
                continue
            elif suffix.startswith(' '):
                sub_dict[L[i].strip().lower()] = L[i + 1]
                del L[i]
                continue
        d = editdist.distance(s1.lower(), s2.lower())
        if (s1.lower() == s2.lower() or d <= 2
                or s1.lower().replace(' ', '') == s2.lower().replace(' ', '')):
            if len(s1) > len(s2):
                sub_dict[L[i + 1].strip().lower()] = L[i]
                del L[i + 1]
            else:
                sub_dict[L[i].strip().lower()] = L[i + 1]
                del L[i]
            continue
        try:
            if re.match(s1.replace('.', '[A-Za-z]+'), s2) is not None:
                sub_dict[L[i].strip().lower()] = L[i + 1]
                del L[i]
                continue
        except:
            pass
        i += 1
def initials_compatibility(ia, ib):
    max_n_initials = max(len(ia), len(ib))
    initials_intersection = set(ia).intersection(set(ib))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(ia).union(set(ib))
    n_initials_union = len(initials_union)
    initials_distance = distance("".join(ia), "".join(ib))
    name_comparison_print('|-- Comparing initials, %s %s' % (ia, ib))
    name_comparison_print('|--- initials distance %s' % (initials_distance))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1
    name_comparison_print('|--- initials c %s' % (initials_c))
    if len(ia) > len(ib):
        alo = ia
        alt = ib
    else:
        alo = ib
        alt = ia
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                                if lo - 1 - i < lt and k != alt[lo - 1 - i]]) / \
            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        initials_distance = float(initials_distance) / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0
    name_comparison_print('|--- initials screwup, %s ' % (initials_screwup))
    name_comparison_print('|--- initials distance, %s' % (initials_distance))
    return max(0.0,
               0.8 * initials_c + 0.1 * (1 - initials_distance) + 0.1 * (1 - initials_screwup))
def matchAuthors_strict_v1(self, google_author_string, authors, debug_output=False): '''If the two author string matched, return True @return: boolean @param: - google_author_string, e.g. … DeSmedt, W Du, W Kent, MA Ketabchi, WA … - …, 1991 - doi.ieeecomputersociety.org R Ahmed, P DeSmedt, W Du, W Kent, MA … - …, 1991 - doi.ieeecomputersociety.org - authors, e.g. Rafi Ahmed,Philippe De Smedt,Weimin Du,William Kent,Mohammad A. Ketabchi,Witold Litwin,Abbas Rafii,Ming-Chien Shan ''' ignore_sign = '…' # ignore_sign = '…' # process google part mark = google_author_string.find(' - ') if mark != -1: google_author_string = google_author_string[:google_author_string. find(' - ')] google_author_string = re.sub("(<(.*?)>)", "", google_author_string) google_author_string = re.sub("[^A-Za-z0-9,\s%s]" % ignore_sign, "", google_author_string) google_author_string = re.sub("\\s+", " ", google_author_string) google_author_string = google_author_string.strip() ignore_left = google_author_string.startswith(ignore_sign) ignore_right = google_author_string.endswith(ignore_sign) compact_google_str = self.__trans_to_compact(google_author_string, ignore_sign) # print '--- ', compact_google_str # process author part compact_authors = self.__trans_to_compact(authors, ignore_sign) # print ',,, ', compact_authors # compare cmp_gc = '' cmp_db = '' if ignore_left and not ignore_right: # and compact_authors.endswith(compact_google_str): cmp_gc = compact_google_str cmp_db = compact_authors[-len(compact_google_str):] elif not ignore_left and ignore_right: # and compact_authors.startswith(compact_google_str): cmp_gc = compact_google_str cmp_db = compact_authors[:len(compact_google_str)] elif ignore_left and ignore_right and compact_authors.find( compact_google_str) != -1: return True # todo elif not ignore_left and not ignore_right: # and compact_authors == compact_google_str: cmp_gc = compact_google_str cmp_db = compact_authors else: return False edd = editdist.distance(cmp_gc, cmp_db) if edd > 0: if debug_output: print '[ERR] editdist for "%s" and "%s" is %s' % (cmp_gc, cmp_db, edd) if edd <= 2: return True
def cluster(inPairs, bcLen, minRealNum=3, minRealFrac=0.1, minMergeFactor=6): uniqueIDs = {} totalCnt = 0 prefixLen = bcLen / 2 misMatchCnt = [0] * (bcLen + 1) distCnt = [0, 0, 0, 0] # garbage collector barcode allNBarcode = "N" * bcLen # group reads by barcode for (readID, barcode) in inPairs: if barcode not in uniqueIDs: uniqueIDs[barcode] = set() uniqueIDs[barcode].add(readID) totalCnt += 1 # count how many times each unique barcode occurs, and get the read count for the most frequent barcode largestUniq = 0 uniqBCCnts = {} barcodeParent = {} childBarcodes = {} for barcode in uniqueIDs: uniqBCCnts[barcode] = len(uniqueIDs[barcode]) barcodeParent[barcode] = "_UNKNOWN_" #ambiguous barcode childBarcodes[barcode] = [] if uniqBCCnts[barcode] > largestUniq: largestUniq = uniqBCCnts[barcode] if allNBarcode not in uniqBCCnts: uniqBCCnts[allNBarcode] = 0 barcodeParent[allNBarcode] = "_SELF_" childBarcodes[allNBarcode] = [] # iteration 1: mark barcodes as real or merge them with other barcodes if they are within 1 bp of a real barcode prefixHash = {} suffixHash = {} sortedBarcodeList = sorted(uniqBCCnts.iteritems(), key=lambda x: x[1], reverse=True) for (bcA, bcACnt) in sortedBarcodeList: prefix = bcA[:prefixLen] suffix = bcA[0 - prefixLen:] if prefix not in prefixHash: prefixHash[prefix] = [] if suffix not in suffixHash: suffixHash[suffix] = [] prefixHash[prefix].append(bcA) suffixHash[suffix].append(bcA) if bcA.find("N") == -1 and uniqBCCnts[ bcA] > minRealFrac * largestUniq and uniqBCCnts[ bcA] >= minRealNum: barcodeParent[bcA] = "_SELF_" # this is a real barcode continue for realBCList in (prefixHash[prefix], suffixHash[suffix]): for bcB in realBCList: if barcodeParent[bcB] != "_SELF_": continue (similar, misMatchPos) = isSimilar(bcB, bcA, bcLen) if similar: barcodeParent[bcA] = bcB childBarcodes[bcB].append(bcA) misMatchCnt[misMatchPos] += bcACnt distCnt[1] += bcACnt break if barcodeParent[bcA] != "_UNKNOWN_": break # already assigned a parent in prefix list # iteration 2: mark barcodes as real or merge them with other barcodes if they are within 1 bp of a another real or merged barcode level2Parent = {} level3Parent = {} for (bcA, bcACnt) in sortedBarcodeList: prefix = bcA[:prefixLen] suffix = bcA[0 - prefixLen:] if barcodeParent[bcA] != "_UNKNOWN_": continue for realBCList in (prefixHash[prefix], suffixHash[suffix]): for bcB in realBCList: if barcodeParent[bcB] == "_SELF_": (similar, misMatchPos) = isSimilar(bcB, bcA, bcLen) if similar: barcodeParent[bcA] = bcB childBarcodes[bcB].append(bcA) misMatchCnt[misMatchPos] += bcACnt distCnt[1] += bcACnt break continue elif barcodeParent[bcB] != "_UNKNOWN_": (similar, misMatchPos) = isSimilar(bcB, bcA, bcLen) if similar: # checking if bcA is within 1 bp of bcB if bcA not in level2Parent: level2Parent[bcA] = set() distCnt[2] += bcACnt level2Parent[bcA].add( barcodeParent[bcB] ) # do not make this parent yet: doing so will cause >2 mismatch links to parents # already assigned a parent: no need to check any further if barcodeParent[bcA] != "_UNKNOWN_" or bcA in level2Parent: break # do a complete global alignment if we do not still find similarity if barcodeParent[bcA] == "_UNKNOWN_" and bcA not in level2Parent: for (bcB, bcBCnt) in sortedBarcodeList: if barcodeParent[bcB] != "_SELF_": continue if bcBCnt < minMergeFactor * bcACnt: break editDistance = editdist.distance(bcB, bcA) if editDistance <= 2: if bcA not in level2Parent: level2Parent[bcA] = set() distCnt[2] += bcACnt level2Parent[bcA].add(bcB) break elif len(bcA) == bcLen - 3: if bcA == 
bcB[3:] or bcA == bcB[0:bcLen - 3]: if bcA not in level3Parent: level3Parent[bcA] = set() distCnt[3] += bcACnt level3Parent[bcA].add(bcB) break if barcodeParent[ bcA] == "_UNKNOWN_" and bcA not in level2Parent and bcA not in level3Parent: barcodeParent[ bcA] = "_SELF_" # not within 1-bp of any child of any real barcode: this must be real as well # clean up and make level2 parent as the full parent for bcA in uniqBCCnts: if bcA in level2Parent: bcAParent = list(level2Parent[bcA])[ 0] # arbitrarily pick the first one if multiple level2 parents barcodeParent[bcA] = bcAParent childBarcodes[bcAParent].append(bcA) distCnt[2] += uniqBCCnts[bcA] elif bcA in level3Parent: bcAParent = list(level3Parent[bcA])[ 0] # arbitrarily pick the first one if muliple level3 parents barcodeParent[bcA] = bcAParent childBarcodes[bcAParent].append(bcA) distCnt[3] += uniqBCCnts[bcA] #DEBUG #for (bcA, bcACnt) in sortedBarcodeList: # if barcodeParent[bcA] == "_SELF_": # distCnt[0] += bcACnt # print ("\t".join((bcA,str(bcACnt)))) # for bcB in childBarcodes[bcA]: # print("\t\t" + "\t".join((bcB, str(uniqBCCnts[bcB])))) #clusterInfo = [] #clusterInfo.append(totalCnt) #clusterInfo.extend(distCnt) #clusterInfo.extend(misMatchCnt) #print("cluster stats:\t" + "\t".join((str(x) for x in clusterInfo))) # output readDict = {} for (bcA, bcACnt) in sortedBarcodeList: if bcA == allNBarcode: continue if barcodeParent[bcA] == "_SELF_": readDict[bcA] = [] readDict[bcA].extend(uniqueIDs[bcA]) for bcB in childBarcodes[bcA]: readDict[bcA].extend(uniqueIDs[bcB]) # reformat ouput, and hack around a bug in the algorithm above mts = {} for (mt, readIds) in readDict.iteritems(): numReads = len(readIds) for readId in readIds: if readId in mts: # ERROR! - read assigned to more than one MT!!!! #print("umi_cluster: bug in barcode clustering - read assigned to more than one MT centroid, readId: {}, MT1: {}, MT2: {}".format(readId,mts[readId],mt)) continue mts[readId] = (mt, numReads) # done return mts
def compare_names(origin_name, target_name): ''' Compare two names. ''' AUTHORNAMES_UTILS_DEBUG = bconfig.AUTHORNAMES_UTILS_DEBUG MAX_ALLOWED_SURNAME_DISTANCE = 2 if AUTHORNAMES_UTILS_DEBUG: print "\nComparing: ", origin_name, ' ', target_name gendernames = GLOBAL_gendernames name_variations = GLOBAL_name_variations no = split_name_parts(origin_name, True, "", True) nt = split_name_parts(target_name, True, "", True) if AUTHORNAMES_UTILS_DEBUG: print "|- splitted no: ", no print "|- splitted nt: ", nt score = 0.0 surname_dist = distance(no[0], nt[0]) if AUTHORNAMES_UTILS_DEBUG: print "|- surname distance: ", surname_dist if surname_dist > 0: artifact_removal = re.compile("[^a-zA-Z0-9]") fn1 = artifact_removal.sub("", no[0]) fn2 = artifact_removal.sub("", nt[0]) if fn1 == fn2: score = 1.0 else: score = max( 0.0, 0.5 - (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE))) else: score = 1.0 if AUTHORNAMES_UTILS_DEBUG: print '||- surname score: ', score initials_only = ((min(len(no[2]), len(nt[2]))) == 0) only_initials_available = False if len(no[2]) == len(nt[2]) and initials_only: only_initials_available = True if AUTHORNAMES_UTILS_DEBUG: print '|- initials only: ', initials_only print '|- only initials available: ', only_initials_available names_are_equal_composites = False if not initials_only: names_are_equal_composites = full_names_are_equal_composites( origin_name, target_name) if AUTHORNAMES_UTILS_DEBUG: print "|- equal composites: ", names_are_equal_composites max_n_initials = max_n_initials = max(len(no[1]), len(nt[1])) initials_intersection = set(no[1]).intersection(set(nt[1])) n_initials_intersection = len(initials_intersection) initials_union = set(no[1]).union(set(nt[1])) n_initials_union = len(initials_union) initials_distance = distance("".join(no[1]), "".join(nt[1])) if n_initials_union > 0: initials_c = float(n_initials_intersection) / float(n_initials_union) else: initials_c = 1 if len(no[1]) > len(nt[1]): alo = no[1] alt = nt[1] else: alo = nt[1] alt = no[1] lo = len(alo) lt = len(alt) if max_n_initials > 0: initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo)) if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \ float(float(max_n_initials * (max_n_initials + 1)) / 2) initials_distance = initials_distance / max_n_initials else: initials_screwup = 0 initials_distance = 0 score = score - (0.75 * initials_screwup + 0.10 * (1 - initials_c)\ + 0.15 * initials_distance) * (score) if AUTHORNAMES_UTILS_DEBUG: print "|- initials sets: ", no[1], " ", nt[1] print "|- initials distance: ", initials_distance print "|- initials c: ", initials_c print "|- initials screwup: ", initials_screwup print "||- initials score: ", score composits_eq = full_names_are_equal_composites(no, nt) if len(no[2]) > 0 and len(nt[2]) > 0: gender_eq = full_names_are_equal_gender(no, nt, gendernames) else: gender_eq = True vars_eq = full_names_are_synonymous(no, nt, name_variations) substr_eq = full_names_are_substrings(no, nt) if not initials_only: if len(no[2]) > len(nt[2]): nalo = no[2] nalt = nt[2] else: nalo = nt[2] nalt = no[2] nlo = len(nalo) nlt = len(nalt) names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i]))) for i, k in enumerate(reversed(nalo)) \ if nlo - 1 - i < nlt] max_names_screwup = max( [float(i[0]) / i[1] for i in names_screwup_list]) avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list])\ / len(names_screwup_list) else: max_names_screwup = 0 avg_names_screwup = 0 score = score - score * 0.75 * max_names_screwup - score 
* 0.25 * avg_names_screwup if AUTHORNAMES_UTILS_DEBUG: print "|- max names screwup: ", max_names_screwup print "|- avg screwup: ", avg_names_screwup print "||- names score: ", score print "|- names composites: ", composits_eq print "|- same gender: ", gender_eq print "|- synonims: ", vars_eq print "|- substrings: ", substr_eq if vars_eq: synmap = [[i, j, names_are_synonymous(i, j, name_variations)] for i in no[2] for j in nt[2]] synmap = [i for i in synmap if i[2] == True] if AUTHORNAMES_UTILS_DEBUG: print "|-- synmap: ", synmap for i in synmap: if no[2].index(i[0]) == nt[2].index(i[1]): score = score + (1 - score) * 0.5 else: score = score + (1 - score) * 0.15 else: if AUTHORNAMES_UTILS_DEBUG: print "|-- synmap: empty" if AUTHORNAMES_UTILS_DEBUG: print "|-- synmap score: ", score if substr_eq and not initials_only: ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2] for j in nt[2]] ssmap = [i for i in ssmap if i[2] == True] if AUTHORNAMES_UTILS_DEBUG: print "|-- substr map: ", ssmap for i in ssmap: if no[2].index(i[0]) == nt[2].index(i[1]): score = score + (1 - score) * 0.2 else: score = score + (1 - score) * 0.05 else: if AUTHORNAMES_UTILS_DEBUG: print "|-- substr map: empty" if AUTHORNAMES_UTILS_DEBUG: print "|-- substring score: ", score if composits_eq and not initials_only: if AUTHORNAMES_UTILS_DEBUG: print "|-- composite names" score = score + (1 - score) * 0.2 else: if AUTHORNAMES_UTILS_DEBUG: print "|-- not composite names" if AUTHORNAMES_UTILS_DEBUG: print "|-- composite score: ", score if not gender_eq: score = score / 3. if AUTHORNAMES_UTILS_DEBUG: print "|-- apply gender penalty" else: if AUTHORNAMES_UTILS_DEBUG: print "|-- no gender penalty" if AUTHORNAMES_UTILS_DEBUG: print "|-- gender score: ", score if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE: score = 0.0 if AUTHORNAMES_UTILS_DEBUG: print "|- surname trim: ", score else: if AUTHORNAMES_UTILS_DEBUG: print "|- no surname trim: ", score if initials_only and not only_initials_available: score = score * .9 if AUTHORNAMES_UTILS_DEBUG: print "|- initials only penalty: ", score, initials_only, only_initials_available else: if AUTHORNAMES_UTILS_DEBUG: print "|- no initials only penalty", initials_only, only_initials_available if AUTHORNAMES_UTILS_DEBUG: print "||- final score: ", score return score
def assign_read_to_indiv(line,indiv_data,mismatch_allowed=1, \ indiv_reads_out_pattern=None,fhdict=None,passfh=None,read2_has_idx=None, \ trim_Q2=False,min_readlen=None,lnum=4,output_lnum=4,baseQ_in=None,baseQ_out=None): '''given a fastq line (actually a list of [read_name,seq,qual_str]), and an indiv_data object (see get_individual_data_for_lane) assigns the read to an individual based on the index tag, strips the index sequence and quality positions, converts quality to list of integers, and returns the sampleid, sequence and quality if a pattern is specified for output (like "/path/to/per-indiv-data/%s_s_1_1_sequence.txt") will also generate per-individual fastqs. using a single fhdict and passfh is highly recommended (i.e. creating beforehand and passing as arguments), but will be generated if absent. FUTURE PLANS: if min_readlen is set, will "pass" reads shorter than min_readlen if trim_Q2 is True will remove all terminal quality 2 bases. If this reduces a read to less then min_readlen good bases, sends to pass returns indiv,read,qual Paired-Ends (PE) HANDLING: if line and indiv_reads_out_pattern are 2-tuples, treats reads as paired-end. This requires that read2_has_idx be either True or False if False, both reads handled per the index bases of line[0] if True, both reads assesssed for index bases, if they DO NOT DISAGREE both reads handled per consensus fhdict keys for PE (line is 2-tuple) are 2-tuples (<indiv>,<readnum>) i.e. (BW001,1) if passfh supplied, must also be 2-tuple returns indiv, (read1, read2), (q1, q2) ''' idxlen = len(indiv_data.keys()[0]) if isinstance(line, tuple) and len(line) == 2: if (isinstance(indiv_reads_out_pattern, tuple) and len(indiv_reads_out_pattern) == 2) or indiv_reads_out_pattern is None: if read2_has_idx is not None: if indiv_reads_out_pattern is not None: if fhdict is None: fhdict = {} if passfh is None: passfh = [ smartopen(p % 'pass', 'w') for p in indiv_reads_out_pattern ] indiv = None heads = [l[0] for l in line] ss = [l[1] for l in line] qstrs = [l[2] for l in line] if baseQ_in is None: bqs = list( set([ get_baseQ(qs) for qs in qstrs if get_baseQ(qs) is not None ])) if len(bqs) == 1: baseQ_in = bqs[0] else: raise ValueError, 'bqs: %s' % bqs if baseQ_out is None: baseQ_out = baseQ_in if len(set([h.split()[0][:-1] for h in heads])) != 1: raise ValueError, 'read headers not identical prior to last character; %s' % heads if read2_has_idx: #check that indices are concordant ts = [s[:idxlen] for s in ss] tqs = [qstr[:idxlen] for qstr in qstrs] tagdists = [ sorted([(distance(t_this, t), t_this) for t_this in indiv_data.keys()]) for t in ts ] try: indiv_cand = [indiv_data[tagdist[0][1]]['sampleid'] for tagdist in tagdists \ if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed] except: indiv_cand = [indiv_data[tagdist[0][1]]['sampleid2'] for tagdist in tagdists \ if tagdist[0][0] <= mismatch_allowed and tagdist[1][0] > mismatch_allowed] if len(set(indiv_cand)) == 1: indiv = indiv_cand[0] read = [s[idxlen:] for s in ss] qual = [[ord(c) - baseQ_in for c in qstr[idxlen:]] for qstr in qstrs] else: #dump both reads per the first t = ss[0][:idxlen] #tag from read1 ts = [t] * 2 # hack for getting tag into both reads, below tqs = [qstrs[0][:idxlen]] * 2 tagdist = sorted([(distance(t_this, t), t_this) for t_this in indiv_data.keys()]) if tagdist[0][0] <= mismatch_allowed and tagdist[1][ 0] > mismatch_allowed: indiv = indiv_data[tagdist[0][1]]['sampleid'] read = [ss[0][idxlen:], ss[1]] qual = [[ord(c) - baseQ_in for c in qstrs[0][idxlen:]], 
[ord(c) - baseQ_in for c in qstrs[1]]] if indiv is None: read = ss qual = [[ord(c) - baseQ_in for c in qstr] for qstr in qstrs] if passfh is not None: for id, s, q, fh in zip(heads, read, qual, passfh): fh.write( as_fq_line( id, s, q, baseQ_out, output_lnum, )) else: if indiv_reads_out_pattern is not None: for h, t, tq, s, q, rn, pat in zip( heads, ts, tqs, read, qual, [1, 2], indiv_reads_out_pattern): newhead = '%s %s:%s' % (h, t, tq) try: fhdict[(indiv, rn)].write( as_fq_line(newhead, s, q, baseQ_out, output_lnum)) except KeyError: fhdict[(indiv, rn)] = smartopen(pat % indiv, 'w') fhdict[(indiv, rn)].write( as_fq_line(newhead, s, q, baseQ_out, output_lnum)) qual = [numpy.array(q, dtype=int) for q in qual] else: raise ValueError, 'read2_has_idx cannot be None for PE reads' else: raise ValueError, 'PE handling invoked, but indiv_out_pattern does not match; must be 2-tuple or None, is: %s' % indiv_reads_out_pattern else: if indiv_reads_out_pattern is not None: if fhdict is None: fhdict = {} if passfh is None: passfh = smartopen(indiv_reads_out_pattern % 'pass', 'w') head, s, qstr = line if baseQ_in is None: if get_baseQ(qstr) is None: raise ValueError, 'could not determine qual base (33 or 64): %s' % qstr else: baseQ_in = get_baseQ(qstr) if baseQ_out is None: baseQ_out = baseQ_in t = s[:idxlen] tagdist = sorted([(distance(t_this, t), t_this) for t_this in indiv_data.keys()]) if tagdist[0][0] <= mismatch_allowed and tagdist[1][ 0] > mismatch_allowed: indiv = indiv_data[tagdist[0][1]]['sampleid'] read = s[idxlen:] qual = [ord(c) - baseQ_in for c in qstr[idxlen:]] if indiv_reads_out_pattern is not None: newhead = '%s:%s:%s' % (head, t, qstr[:idxlen]) try: fhdict[indiv].write( as_fq_line(newhead, read, qual, baseQ_out, output_lnum)) except KeyError: fhdict[indiv] = smartopen(indiv_reads_out_pattern % indiv, 'w') fhdict[indiv].write( as_fq_line(newhead, read, qual, baseQ_out, output_lnum)) else: indiv = None read = s qual = [ord(c) - baseQ_in for c in qstr] if passfh is not None: passfh.write(as_fq_line(head, s, qual, baseQ_out, output_lnum)) qual = numpy.array(qual, dtype=int) return indiv, read, qual
def test_01__reversed_test_vectors(self):
    for b, a, score in test_vectors:
        self.assertEqual(editdist.distance(a, b), score)
def string_match_score(p1, p2, field):
    s1 = p1[field]
    s2 = p2[field]
    return editdist.distance(s1.lower(), s2.lower()) / float(len(s1))
def matchPub(self, golist, mylist, aid): data = [] fdr = open('./output/data.txt', 'r') for i in fdr.readlines(): data.append(i) fdr.close() cursor_my = self.conn_my.cursor() print 'matching' ''' automatically updates the database return a list of paper dics containing updating information ''' table_mon = self.db_mon[self.table_mon] godics = golist mytitles = mylist[0] aid = mylist[1] print_not_matched = False pubs_matched = [] pubs_not_matched = [] #fw1 = open('C:\\Python27\\tutorial\\tutorial\\test\\%dmatched.txt'%aid,'w') #fw2 = open('C:\\Python27\\tutorial\\tutorial\\test\\%dfailed.txt'%aid,'w') fw3 = open('./output/%dmulmatched.txt' % aid, 'w') t = 0 for mydic in mytitles: ncitation = mydic['ncitation'] mytitle = mydic['title'] mytitleCleaned = self.cleanGoogleTitle(mytitle) short_key = mytitleCleaned[1] matchedlist = [] pid = mydic['pid'] pid = pid[0] for godic in godics: start = time.time() gotitle = godic['title'] _gotitle = '' for cha in gotitle: if cha <= chr(127): _gotitle = _gotitle + cha gotitleCleaned = self.cleanGoogleTitle(gotitle) key_title = gotitleCleaned[1] has_dot = gotitleCleaned[2] exactmatched = False if has_dot: if key_title.find(short_key) != -1: exactmatched = True matchedlist.append(godic) godics.remove(godic) else: if key_title == short_key: exactmatched = True matchedlist.append(godic) godics.remove(godic) if not exactmatched: #if can not be critical matched, try by calculate Levenshtein distance ed = editdist.distance(short_key, key_title) if ed < 10: #adaptable looseValue = float(len(key_title)) * (10 / float(100)) if looseValue > ed: matchedlist.append(godic) godics.remove(godic) end = time.time() if (start - end) != 0: t += 1 if len(matchedlist) == 1: try: pubs_matched.append({ 'title': matchedlist[0]['title'], 'pid_in_mysql': pid, 'citation': matchedlist[0]['citation'], 'essay_others': godic['essay_others'] }) #fw1.write('title1:%s title2:%s citation:%s ncitation:%s pid%d\n'%(mytitle,matchedlist[0]['title'],matchedlist[0]['citation'],ncitation,pid[0])) except: pass #fw1.write('title1:%s citation:%s ncitation:%s pid%d\n'%(mytitle,matchedlist[0]['citation'],ncitation,pid[0])) elif len(matchedlist) >= 2: same = False num = len(matchedlist) for i in range(0, num): if i == num - 1: same = True break if matchedlist[i] == matchedlist[i + 1]: continue else: break if same: try: pubs_matched.append({ 'title': matchedlist[0]['title'], 'pid_in_mysql': pid, 'citation': matchedlist[0]['citation'], 'essay_others': godic['essay_others'] }) except: pass else: for paper in matchedlist: godics.append(paper) fw3.write( 'title1:%s citation:%s ncitation:%s pid%d\n' % (mytitle, matchedlist[0]['citation'], ncitation, pid)) else: #fw2.write('title:%s citation:-1\n'%mytitle) #pubs_not_matched.append({'title':matchedlist[0]['title'],'citation':matchedlist[0]['citation'],'essay_others':godic['essay_others']}) continue #fw1.close() #fw2.close() fw3.close() fdw = open('./output/data.txt', 'w') log = '%d %d %d %f %d \n' % (aid, len(pubs_matched), len(mytitles), float(len(pubs_matched)) / float(len(mytitles)), t) print log data.append(log) for i in data: fdw.write(i) fdw.close() pubs_matched.extend(godics) return pubs_matched