Example #1
def maj_argus_zoe(db_zoe, df_argus):
    for el in db_zoe['id']:
        text_description = db_zoe[db_zoe['id'] == el]['description'].values[0]
        text_version = db_zoe[db_zoe['id'] == el]['version'].values[0]
        year_tmp = db_zoe[db_zoe['id'] == el]['year'].values[0]
        if str(year_tmp)[:2] == '20' and year_tmp != '':
            df_argus_tmp = df_argus[df_argus.year == year_tmp]
        else:
            df_argus_tmp = df_argus

        for el_argus in df_argus_tmp['version'].values:
            # Work on a copy so the description-based and version-based
            # distances are kept separate before being combined below.
            df_argus_sub_tmp = df_argus_tmp.copy()

            # Find the 'La centrale' Zoe version closest to the leboncoin
            # listing (title/description).
            df_argus_tmp.loc[df_argus_tmp.version == el_argus,
                             'select'] = pylev.levenshtein(
                                 text_description, str(el_argus))
            df_argus_sub_tmp.loc[df_argus_sub_tmp.version == el_argus,
                                 'select'] = pylev.levenshtein(
                                     text_version, str(el_argus))
            df_argus_fin_tmp = df_argus_tmp.append(df_argus_sub_tmp)
            distance_min = df_argus_fin_tmp['select'].min()

            argus_price = df_argus_fin_tmp[df_argus_fin_tmp['select'] ==
                                           distance_min]['argus'].values[0]
            db_zoe.loc[db_zoe.id == el, 'argus'] = float(argus_price)
            db_zoe.loc[db_zoe.id == el, 'official_version'] = el_argus
    return db_zoe
Example #2
def isIndexRevComp(indexfile, indexes, n=500000):
    """Determine if the indexes are reverse complemented or not
    
    :param indexfile: filename of the Fastq index file
    :param indexes: list or tuple of index strings
    :param n: integer number of reads to sample
    """
    print("HERE")
    ifile = Fastq(indexfile)
    ilength = len(indexes[0])
    print(ilength)
    indexreads = collections.defaultdict(int)
    for i in range(n):
        indexreads[ifile.next().sequence[:ilength]] += 1
    counts = {'normal': 0, 'revcomp': 0}
    for k, v in list(indexreads.items()):
        print(k, v)
        for i in indexes:
            if (pylev.levenshtein(k, i) <= 1):
                counts['normal'] += v
                continue
            if (pylev.levenshtein(k, revcomp(i)) <= 1):
                counts['revcomp'] += v
    if (counts['revcomp'] > counts['normal']):
        print('using revcomp')
    else:
        print('NOT revcomp')

    return (counts['revcomp'] > counts['normal'])
Example #3
def equal_levenshtein(string1: str, string2: str, min_index: int) -> int:
	"""Recursive Levenshtein. Only looks at the next chunk of characters that
	could push the Levenshtein distance of this string pair above the minimum
	required. If it does not, the function calls itself on the rest of the
	strings. This way we do not check the whole strings at once but work in
	sections, since the minimum distance may already be exceeded after checking
	half of the string, saving valuable computing resources (Levenshtein has
	complexity O(m*n), where m and n are the lengths of the two compared
	strings)."""

	length = len(string1)
	if length <= min_index:
		return pylev.levenshtein(string1, string2)
	else:
		index = pylev.levenshtein(string1[0:min_index+1], string2[
				0:min_index+1])
		if index > min_index:
			return False
		else:
			sub_index = equal_levenshtein(string1[min_index+1:], string2[min_index+1:], min_index-index)
			if sub_index is False:
				return False
			else:
				index += sub_index
				if index > min_index:
					return False
				else:
					return index
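A brief usage sketch (hypothetical calls, not part of the original snippet): the helper returns the computed distance while it stays within the budget, and False as soon as the budget is exceeded, so a caller can skip full comparisons early. It assumes pylev is imported, as in the snippet above.

print(equal_levenshtein("kitten", "sitting", 3))  # 3 (within the budget of 3)
print(equal_levenshtein("kitten", "sitting", 1))  # False (budget exceeded early)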
Example #4
def isIndexRevComp(indexfile,indexes,n=500000):
    """Determine if the indexes are reverse complemented or not
    
    :param indexfile: filename of the Fastq index file
    :param indexes: list or tuple of index strings
    :param n: integer number of reads to sample
    """
    print("HERE")
    ifile = Fastq(indexfile)
    ilength=len(indexes[0])
    print(ilength)
    indexreads = collections.defaultdict(int)
    for i in range(n):
        indexreads[ifile.next().sequence[:ilength]]+=1
    counts = {'normal':0,
              'revcomp':0}
    for k,v in list(indexreads.items()):
        print(k,v)
        for i in indexes:
            if(pylev.levenshtein(k,i)<=1):
                counts['normal']+=v
                continue
            if(pylev.levenshtein(k,revcomp(i))<=1):
                counts['revcomp']+=v
    if(counts['revcomp']>counts['normal']):
        print('using revcomp')
    else:
        print('NOT revcomp')
        
    return(counts['revcomp']>counts['normal'])
Example #5
    def lev_distance(q1, q2, process):
        if process:
            lev = float(levenshtein(' '.join(q1), ' '.join(q2)))
            return [lev / float(max(1, len(' '.join(q1)) + len(' '.join(q2)))),
                   lev / float(max(1, min(len(' '.join(q1)), len(' '.join(q2))))),
                   lev / float(max(1, max(len(' '.join(q1)), len(' '.join(q2)))))]

        else:
            lev = float(levenshtein(q1, q2))
            return [lev / float(max(1, len(q1) + len(q2))), 
                   lev / float(max(1, min(len(q1), len(q2)))),
                   lev / float(max(1, max(len(q1), len(q2))))]
Example #6
File: application.py Project: hason/cleo
    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        @param name: The string
        @type name: str
        @param collection: The collection
        @type collection: list

        @return: A sorted list of similar strings
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {}
        for item in collection:
            collection_parts[item] = item.split(':')

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                # The name part at index i may not exist for shorter names.
                if i >= len(parts) and exists:
                    alternatives[collection_name] += threshold
                    continue
                elif i >= len(parts):
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    if exists:
                        alternatives[collection_name] = alternatives[
                            collection_name] + lev
                    else:
                        alternatives[collection_name] = lev
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        alternatives = list(
            filter(lambda a: a[1] < 2 * threshold, alternatives.items()))
        alternatives.sort(key=lambda x: x[1])

        return list(map(lambda x: x[0], alternatives))
Example #7
    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        :param name: The string
        :type name: str
        :param collection: The collection
        :type collection: list

        :return: A sorted list of similar strings
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {}
        for item in collection:
            collection_parts[item] = item.split(':')

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                # The name part at index i may not exist for shorter names.
                if i >= len(parts) and exists:
                    alternatives[collection_name] += threshold
                    continue
                elif i >= len(parts):
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    if exists:
                        alternatives[collection_name] += lev
                    else:
                        alternatives[collection_name] = lev
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        alts = []
        for alt, score in alternatives.items():
            if score < 2 * threshold:
                alts.append(alt)

        return alts
Example #8
File: application.py Project: hason/cleo
    def find_alternatives(self, name, collection):
        """
        Finds alternatives of name in collection

        @param name: The string
        @type name: str
        @param collection: The collection
        @type collection: list

        @return: A sorted list of similar strings
        """
        threshold = 1e3
        alternatives = {}

        collection_parts = {}
        for item in collection:
            collection_parts[item] = item.split(':')

        for i, subname in enumerate(name.split(':')):
            for collection_name, parts in collection_parts.items():
                exists = collection_name in alternatives
                # The name part at index i may not exist for shorter names.
                if i >= len(parts) and exists:
                    alternatives[collection_name] += threshold
                    continue
                elif i >= len(parts):
                    continue

                lev = levenshtein(subname, parts[i])
                if lev <= (len(subname) / 3) or parts[i].find(subname) != -1:
                    if exists:
                        alternatives[collection_name] = alternatives[collection_name] + lev
                    else:
                        alternatives[collection_name] = lev
                elif exists:
                    alternatives[collection_name] += threshold

        for item in collection:
            lev = levenshtein(name, item)
            if lev <= (len(name) / 3) or item.find(name) != -1:
                if item in alternatives:
                    alternatives[item] = alternatives[item] - lev
                else:
                    alternatives[item] = lev

        alternatives = list(filter(lambda a: a[1] < 2 * threshold, alternatives.items()))
        alternatives.sort(key=lambda x: x[1])

        return list(map(lambda x: x[0], alternatives))
Example #9
 def suggest(self, key, distance=3):
     suggestions = set()
     for index in self.indices:
         for candidate in index:
             if levenshtein(key, candidate) <= distance:
                 suggestions.add(index[candidate])
     return suggestions
Example #10
File: lines.py Project: nibrahim/lines
def patterns(f1, dist = 55, outlier=10):
    """Will partition elements into subsets. The elements of a subset will
    not have a Levenshtein distance of more than :dist: from the other
    members of the same subset
    """
    sets = []
    seen = set()
    for i in (x.strip() for x in f1 if x.strip()):
        if i in seen:
            continue
        s = set([i])
        seen.add(i)
        others = set(x.strip() for x in f1 if x.strip()) - seen
        for j in others:
            v = levenshtein(i, j) 
            if v <= dist:
                s.add(j)
                seen.add(j)
        sets.append(s)

    # Format for printing
    outlier /= 100.0
    retval = []
    total = len(list(x.strip() for x in f1 if x.strip()))
    for i in sets:
        l = float(len(i))
        if l/total < outlier:
            retval.append("{} elements - {}".format(len(i), i))
        else:
            retval.append("{} elements".format(len(i)))
    
    return retval
Example #11
def filter_brand_name(v, threshold, logger=None):
    # Handles addresses that begin with the brand name
    """
    Strip the brand name when the address starts with it
    :param v: (record, modified) tuple
    :param threshold: threshold for the Levenshtein distance
    :return: the (possibly updated) record and the modified flag
    """
    record, modified = v
    logger = logging.getLogger() if logger is None else logger
    record = record.copy()

    addr = record[u'addr_e_rev'] if record[
        u'addr_e_rev'] is not None else record[u'addr_e']
    if addr is None:
        return record, modified
    addr_list = tuple(temp.strip() for temp in addr.split(u','))
    if len(addr_list) <= 1:
        return record, modified
    str1 = addr_list[0].lower()
    str2 = record[u'brandname_e'].strip().lower()
    dist = pylev.levenshtein(str1, str2)
    if dist < threshold:
        logger.info(
            unicode.format(u'{0} is similar to {1}, idstores={2}', addr,
                           record[u'brandname_e'], record[u'idstores']))
        record[u'addr_e_rev'] = u', '.join(addr_list[1:])
        modified = True
    return record, modified
Example #12
def _search_subnode(word, node_id, tolerance):
    word_from_id_query = "SELECT word FROM words WHERE id = {0};"
    get_children_query = """SELECT e.child_id, e.dist, w.word FROM edges e
        INNER JOIN words w ON e.child_id = w.id WHERE parent_id = {0};"""
    node_word = _perform_selection(word_from_id_query.format(node_id))[0][0]
    dist = levenshtein(word, node_word)
    result = set()
    if dist <= tolerance:
        result.add(node_word)
    children = _perform_selection(get_children_query.format(node_id))
    for child_id, child_parent_dist, child_word in children:
        child_word_dist = levenshtein(word, child_word)
        if dist - tolerance <= child_parent_dist <= dist + tolerance:
            result |= _search_subnode(word, child_id, tolerance)

    return result
Example #13
def link_author(author):
    global author_list
    for lab_author in author_list.keys():
        if pylev.levenshtein(lab_author, author) / max(len(lab_author),
                                                       len(author)) < .30:
            return author_list[lab_author]
    return author
Example #14
File: command.py Project: pmav99/clikit
def find_similar_command_names(
        name, commands):  # type: (str, CommandCollection) -> List[str]
    """
    Finds names similar to a given command name.
    """
    threshold = 1e3
    distance_by_name = {}

    # Include aliases in the search
    actual_names = commands.get_names(True)

    for actual_name in actual_names:
        # Get Levenshtein distance between the input and each command name
        distance = levenshtein(name, actual_name)

        is_similar = distance <= len(name) / 3
        is_sub_string = actual_name.find(name) != -1

        if is_similar or is_sub_string:
            distance_by_name[actual_name] = distance

    # Only keep results with a distance below the threshold
    distance_by_name = {
        k: v
        for k, v in distance_by_name.items() if v < 2 * threshold
    }

    # Display results with shortest distance first
    suggested_names = []
    for k, v in sorted(distance_by_name.items(), key=lambda item: item[1]):
        if k not in suggested_names:
            suggested_names.append(k)

    return suggested_names
Example #15
def cached(a, b):
    if 'data' not in cached.__dict__:
        cached.data = {}
    k = (a,b)
    if k not in cached.data:
        cached.data[k] = pylev.levenshtein(a, b)
    return cached.data[k]
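A small usage sketch (hypothetical, not part of the original snippet) showing the function-attribute cache: the second call with the same pair is served from cached.data instead of recomputing the distance.

print(cached("kitten", "sitting"))  # computes pylev.levenshtein once -> 3
print(cached("kitten", "sitting"))  # answered from cached.data
print(len(cached.data))             # 1 distinct pair stored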
Example #16
def find_similar_names(name, names):  # type: (str, List[str]) -> List[str]
    """
    Finds names similar to a given command name.
    """
    threshold = 1e3
    distance_by_name = {}
    suggested_names = []

    for actual_name in names:
        # Get Levenshtein distance between the input and each command name
        distance = levenshtein(name, actual_name)

        is_similar = distance <= len(name) / 3
        is_sub_string = actual_name.find(name) != -1

        if is_similar or is_sub_string:
            distance_by_name[actual_name] = (
                distance,
                actual_name.find(name) if is_sub_string else float("inf"),
            )

    # Only keep results with a distance below the threshold
    distance_by_name = {
        k: v
        for k, v in distance_by_name.items() if v[0] < 2 * threshold
    }

    # Display results with shortest distance first
    for k, v in sorted(distance_by_name.items(),
                       key=lambda i: (i[1][0], i[1][1])):
        if k not in suggested_names:
            suggested_names.append(k)

    return suggested_names
Example #17
def patterns(f1, dist=55, outlier=10):
    """Will partition elements into subsets. The elements of a subset will
    not have a Levenshtein distance of more than :dist: from the other
    members of the same subset
    """
    sets = []
    seen = set()
    for i in (x.strip() for x in f1 if x.strip()):
        if i in seen:
            continue
        s = set([i])
        seen.add(i)
        others = set(x.strip() for x in f1 if x.strip()) - seen
        for j in others:
            v = levenshtein(i, j)
            if v <= dist:
                s.add(j)
                seen.add(j)
        sets.append(s)

    # Format for printing
    outlier /= 100.0
    retval = []
    total = len(list(x.strip() for x in f1 if x.strip()))
    for i in sets:
        l = float(len(i))
        if l / total < outlier:
            retval.append("{} elements - {}".format(len(i), i))
        else:
            retval.append("{} elements".format(len(i)))

    return retval
Example #18
 def get_best_match(self, response, tool_name, max_edition_percentage: Optional[float] = 0.1):
     # biotoolsID = response['list'][0]['biotoolsID']
     biotools_item = None
     normalized_name = self.normalize(tool_name)
     min_edit = len(tool_name) * 10000
     for item in response['list']:
         choice_edit = pylev.levenshtein(normalized_name, self.normalize(item['biotoolsID']))
         if choice_edit < min_edit:
             min_edit = choice_edit
             biotools_item = item
         choice_edit = pylev.levenshtein(normalized_name, self.normalize(item['name']))
         if choice_edit < min_edit:
             min_edit = choice_edit
             biotools_item = item
     if max_edition_percentage is not None and min_edit > len(tool_name) * max_edition_percentage:
         return None
     return biotools_item
Example #19
def levenshtein_worker(queue, results):
    while True:
        work = queue.get()
        (hash1, hash2, sector1, sector2, score) = work

        distance = pylev.levenshtein(sector1, sector2)
        results.put_nowait((hash1, hash2, distance, score))

        queue.task_done()
Example #20
def _is_duplicate(a: str, b: str) -> bool:
    """Determine whether two stacktraces are for the same error."""
    la = len(a)
    lb = len(b)
    diff = abs(la - lb)
    if diff > 50:
        return False
    denom = min(la, lb) + diff / 2
    ratio = levenshtein(a.casefold(), b.casefold()) / denom
    return ratio < 0.1
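A hypothetical illustration (strings invented here, not taken from the source project) of the threshold: the two traces below have equal length and a case-insensitive edit distance of 1, so the ratio is well under 0.1 and they are treated as duplicates.

a = "ValueError: bad input at line 10"
b = "ValueError: bad input at line 12"
print(_is_duplicate(a, b))  # True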
Example #21
 def selectSeries(self, series, allSeries):
     """The results the TVDB returns are sometimes poorly ranked and the first result is often not what we're looking
     for. This function attempts to find the closest match between a series named in the results and the user's
     search query by calculating the Levenshtein edit distance between the search query (series) and each of the
     results (allSeries) in order to find the result that most precisely matches our query
     """
     distances = []
     for show in allSeries:
         distances.append(pylev.levenshtein(series, show["seriesName"]))
     return allSeries[distances.index(min(distances))]
Example #22
    def evaluate_individual_sentence(self, original_sentence,
                                     paraphrase) -> Dict:

        original_sentence_tokens = nltk.word_tokenize(
            normalize_spaces_remove_urls(original_sentence))
        paraphrase_tokens = nltk.word_tokenize(
            normalize_spaces_remove_urls(paraphrase))

        # Bleu score
        bleu_score = nltk.translate.bleu_score.sentence_bleu(
            [normalize_spaces_remove_urls(original_sentence)],
            normalize_spaces_remove_urls(paraphrase))

        # Sentence embedding cosine similarity
        emb1 = self.model.encode(original_sentence)
        emb2 = self.model.encode(paraphrase)
        cos_sim = util.pytorch_cos_sim(emb1, emb2)

        # Levenshtein distance
        edit_distance = pylev.levenshtein(original_sentence_tokens,
                                          paraphrase_tokens)
        length = max(len(original_sentence_tokens), len(paraphrase_tokens))
        normalized_edit_distance = (length - edit_distance) / length

        # Jaccard
        jaccard = nltk.jaccard_distance(set(original_sentence_tokens),
                                        set(paraphrase_tokens))

        # Jaccard * cosine similarity
        jaccard_embedding_factor = jaccard * cos_sim.item()

        metrics = {
            'original_sentence': original_sentence,
            'paraphrase': paraphrase,
            'bleu_score': bleu_score,
            'normalized_original_sentence':
                normalize_spaces_remove_urls(original_sentence),
            'normalized_paraphrase': normalize_spaces_remove_urls(paraphrase),
            'embedding_cosine_similarity': cos_sim.item(),
            'edit_distance': edit_distance,
            'normalized_edit_distance': normalized_edit_distance,
            'jaccard': jaccard,
            'jaccard_embedding_factor': jaccard_embedding_factor
        }

        return metrics
Example #23
def planet_constellation(update, context):
    translator = Translator()
    text = update.message.text
    text = text.split()
    min_distance = 1000
    best_planet_choice = ''
    user_planet_in_text = ''
    for cur_word in text:
        for cur_planet in planet_list:
            if pylev.levenshtein(cur_word, cur_planet) < min_distance:
                min_distance = pylev.levenshtein(cur_word, cur_planet)
                best_planet_choice = cur_planet
                user_planet_in_text = cur_word
    full_name = find_constellation(best_planet_choice)
    full_name_ru = translator.translate(full_name,dest='russian', src='en').text
    if user_planet_in_text.upper() != best_planet_choice.upper():
        ans_text = f'Did you mean {best_planet_choice}? \n {full_name} / {full_name_ru}'
    else:
        ans_text = f'{full_name} / {full_name_ru}'
    update.message.reply_text(ans_text)            
Example #24
def levenshtein(a, b):
    len_a = len(a)
    len_b = len(b)
    distance = pylev.levenshtein(a, b)
    try:
        maxLength = max(len_a, len_b)
        result = maxLength - distance
        percentage = (result / maxLength) * 100
        return percentage
    except:
        return 0
Example #25
def compare(a, b):
    results = {
        'editdistance': editdistance.eval(a, b),
        'pylev': pylev.levenshtein(a, b),
        'python-Levenshtein': Levenshtein.distance(a, b),
        'pyxdameraulevenshtein':
            pyxdameraulevenshtein.damerau_levenshtein_distance(a, b),
    }
    return results
Example #26
    def search(self, query, threshold):
        d = levenshtein(self.value, query)
        if d <= threshold:
            yield self.value

        lo = d - threshold
        hi = d + threshold

        for dist, node in self.children.items():
            if lo <= dist <= hi:
                for rv in node.search(query, threshold):
                    yield rv
Example #27
def similarity(s1, s2):
    # Length considerations on/off
    if NO_LEN == 1:
        trunc_len = min(len(s1), len(s2), TRUNC)
    else:
        trunc_len = TRUNC

    # Truncate
    s1 = s1[:trunc_len]
    s2 = s2[:trunc_len]

    # Return the levenshtein distance between the two modified strings
    return pylev.levenshtein(s1, s2)
Example #28
def compare_to_gold_labels(entity: str, gold_entities: List[str],
                           para_id: int) -> str:

    if entity in gold_entities:
        return entity

    for gold_ent in gold_entities:
        if (pylev.levenshtein(entity, gold_ent) < 3):
            return gold_ent

    print(para_id)
    print(f"Cannot find {entity}")
    return entity
Example #29
def similarity(s1, s2):
  # Length considerations on/off
  if NO_LEN == 1:
    trunc_len = min(len(s1), len(s2), TRUNC)
  else:
    trunc_len = TRUNC

  # Truncate
  s1 = s1[:trunc_len]
  s2 = s2[:trunc_len]

  # Return the levenshtein distance between the two modified strings
  return pylev.levenshtein(s1, s2)
Example #30
def similarity_index_per_item(item1, item2):
	if type(item1)==str and type(item2)==str:
		return pylev.levenshtein(item1,item2)
	if ((type(item1)==int and type(item2)==int)
			or (type(item1)==float and type(item2)==float) 
			or (type(item1)==long and type(item2)==long)):
		return (abs((float)(item1 - item2)))/float(max([item1+1,item2+1]))
	if type(item1)==bool and type(item2)==bool:
		if item1 == item2:
			return 0
		else:
			return 1
	if (type(item1)==dict and type(item2)==dict) or (type(item1)==list and type(item2)==list):
		return 1-jaccard(item1,item2)
Example #31
def _iter_fuzzy_entries(catalog: Catalog,
                        search_key: Key) -> typ.Iterable[Entry]:
    for key in _iter_candidate_keys(catalog, search_key):
        msg_text_dist = pylev.levenshtein(key.msg_text, search_key.msg_text)
        src_line_dist = pylev.levenshtein(key.source_line,
                                          search_key.source_line)

        if msg_text_dist > FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS:
            continue
        if src_line_dist > FUZZY_MATCH_MAX_EDIT_DISTANCE_ABS:
            continue

        msg_text_dist_pct = 100 * msg_text_dist / max(len(key.msg_text),
                                                      len(search_key.msg_text))
        src_line_dist_pct = (
            100 * src_line_dist /
            max(len(key.source_line), len(search_key.source_line)))

        if msg_text_dist_pct > FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT:
            continue
        if src_line_dist_pct > FUZZY_MATCH_MAX_EDIT_DISTANCE_PCT:
            continue

        yield catalog[key]
Example #32
def main():
    if len(sys.argv) != 2:
        exit(f"Usage: {sys.argv[0]} filename")
    filename = sys.argv[1]
    outfile = 'out.txt'

    rows = []
    with open(filename) as fh:
        for row in fh:
            rows.append(row.rstrip("\n"))
    with open(outfile, 'w') as fh:
        for a in rows:
            for b in rows:
                dist = pylev.levenshtein(a, b)
                fh.write(f"{a},{b},{dist}\n")
Example #33
	def transform(self, question_list):
		q1_list = question_list[0]
		q2_list = question_list[1]

		lev_distance_strings = [[a, b] for a, b in zip(q1_list, q2_list)]

		lev_dist_array = np.array([
			float(levenshtein(pair[0], pair[1])) /
			(float(sum(x.count('') for x in pair[0])) +
				float(sum(x.count('') for x in pair[1])))
			for pair in lev_distance_strings
		])

		return lev_dist_array.reshape(len(lev_dist_array), 1)
Example #34
def compare_to_gold_labels(participants, system_participants):
    ret = []
    for p in participants:
        found = False
        p = p.lower()
        if p in system_participants:
            ret.append(p)
            continue
        for g in system_participants:
            if (pylev.levenshtein(p,g) < 3):
                #print (p, "===", g)
                ret.append(g)
                found = True
        if not found:
            print(f"Cannot find {p}")
    return ret
Example #35
def get_similar_members():
    allowed_distance = int(request.args.get('distance') or 4)
    members = Member.query.all()
    similar = {'a': [], 'b': []}
    for left, right in itertools.combinations(members, 2):
        distance = pylev.levenshtein(left.name, right.name)
        if distance < allowed_distance:
            left_json = left.serialize
            left_json['pmts'] = len(left.payments)
            similar['a'].append(left_json)

            right_json = right.serialize
            right_json['pmts'] = len(right.payments)
            similar['b'].append(right_json)

    return render_template('similar.html', similar_members=similar)
Example #36
File: cityres.py Project: jopela/cityres
def choose_best(city, uris):
    """
    Chooses the string that most closely resembles the city name.

    EXAMPLE
    =======
    >>> choose_best('Montreal',['http://dbpedia.org/resource/Montreal','http://dbpedia.org/resource/Westmount_(Montreal)'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Mountreal','http://dbpedia.org/resource/Moscow','http://dbpedia.org/resource/Montreal'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('New York',['http://dbpedia.org/resource/New_York_City','http://dbpedia.org/Harlem'])
    'http://dbpedia.org/resource/New_York_City'

    """

    # The strategy is to use the longest common subsequence first and
    # take the string whose uri has the longest one. If there are ties,
    # break the tie by computing the levenshtein distance and taking the
    # uri that has the smallest.

    # this creates a kind of band-pass filter, so to speak.

    distances = [(strdist.longest_sub_len(city, uri), uri) for uri in uris]

    # sort them by sub sequence length
    distances.sort()

    result_subseq_length = distances[-1][0]

    #print("distances",distances)

    ties = [e for e in distances if e[0] == result_subseq_length]

    #print("ties")

    # break the tie with the levenshtein distance.
    if len(ties) > 1:
        tie_distances = [(pylev.levenshtein(city, t[1]), t[1]) for t in ties]
        tie_distances.sort()
        result = tie_distances[0][1]
    else:
        result = distances[-1][1]

    return result
Example #37
File: cityres.py Project: jopela/cityres
def choose_best(city, uris):
    """
    Chooses the string that most closely resembles the city name.

    EXAMPLE
    =======
    >>> choose_best('Montreal',['http://dbpedia.org/resource/Montreal','http://dbpedia.org/resource/Westmount_(Montreal)'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('Montreal',['http://dbpedia.org/resource/Mountreal','http://dbpedia.org/resource/Moscow','http://dbpedia.org/resource/Montreal'])
    'http://dbpedia.org/resource/Montreal'

    >>> choose_best('New York',['http://dbpedia.org/resource/New_York_City','http://dbpedia.org/Harlem'])
    'http://dbpedia.org/resource/New_York_City'

    """

    # The strategy is to use the longest common subsequence first and
    # take the string whose uri has the longest one. If there are ties,
    # break the tie by computing the levenshtein distance and taking the
    # uri that has the smallest.

    # this creates a kind of band-pass filter, so to speak.

    distances = [(strdist.longest_sub_len(city, uri), uri) for uri in uris]

    # sort them by sub sequence length
    distances.sort()

    result_subseq_length = distances[-1][0]

    #print("distances",distances)

    ties = [e for e in distances if e[0] == result_subseq_length]

    #print("ties")

    # break the tie with the levenshtein distance.
    if len(ties) > 1:
        tie_distances = [(pylev.levenshtein(city, t[1]),t[1]) for t in ties]
        tie_distances.sort()
        result = tie_distances[0][1]
    else:
        result = distances[-1][1]

    return result
Example #38
def score_domain(provided_ioc):
    """Return the scores of the provided domain."""
    score = 0

    for suspicious_tld in suspicious["tlds"]:
        if provided_ioc.endswith(suspicious_tld):
            score += 20

    try:
        res = tld.get_tld(provided_ioc,
                          as_object=True,
                          fail_silently=True,
                          fix_protocol=True)
        domain = ".".join([res.subdomain, res.domain])
    except Exception:
        domain = provided_ioc

    score += int(round(entropy.shannon_entropy(domain) * 50))
    domain = confusables.unconfuse(domain)
    words_in_domain = re.split(r"\W+", domain)

    if domain.startswith("*."):
        domain = domain[2:]

        if words_in_domain[0] in ["com", "net", "org"]:
            score += 10

    for word in suspicious["keywords"]:
        if word in domain:
            score += suspicious["keywords"][word]

    for key in [k for k, v in suspicious["keywords"].items() if v >= 70]:
        for word in [
                w for w in words_in_domain
                if w not in ["email", "mail", "cloud"]
        ]:
            if pylev.levenshtein(str(word), str(key)) == 1:
                score += 70

    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3

    if domain.count(".") >= 3:
        score += domain.count(".") * 3
    return score
Example #39
def uniqify(corpus, occ_dict, distance):
	# augment with value counts (which one to keep)
	words = []
	while corpus:
		center = corpus[0]
		related = [word for word in corpus if pylev.levenshtein(center, word) <= distance]
		tuples = [(word, occ_dict[word.title()]) for word in related]
		sorted_ts = sorted(tuples, key=lambda x: x[1], reverse=True)
		print(sorted_ts)
		winner = sorted_ts[0][0]
		print(corpus)
		for t in sorted_ts:
			print(t)
			corpus.remove(t[0])
		# keep taluk with highest number of occurrences
		# create dict by taking difference between corpae
		words.append(winner)
	return [x.title() for x in words]
Example #40
def _connect_word_to_tree(word):
    last_id_query = "SELECT MAX(id) AS max_id FROM words;"
    word_from_id_query = "SELECT word FROM words WHERE id = {0};"
    find_child_at_dist_query = """SELECT child_id FROM edges 
        WHERE parent_id = {0} AND dist = {1};"""
    connect_to_tree_query = """INSERT INTO edges (parent_id, child_id, dist) 
        VALUES ({0}, {1}, {2});"""
    root_id = 1
    child_id = ['initial_id']
    node_id = root_id
    word_id = _perform_selection(last_id_query)[0][0]

    while len(child_id) > 0:
        node_word = _perform_selection(word_from_id_query.format(node_id))[0][0]
        dist = levenshtein(word, node_word)
        child_id = _perform_selection(find_child_at_dist_query.format(node_id, dist))
        if len(child_id) > 0:
            node_id = child_id[0][0]

    _perform_insertion(connect_to_tree_query.format(node_id, word_id, dist))
Example #41
 def insert(self, string):
     dist = levenshtein(string, self.value)
     if dist not in self.children:
         self.children[dist] = Node(string)
         return
     self.children[dist].insert(string)
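Examples #26 and #41 are fragments of the same kind of BK-tree node (search and insert). A minimal self-contained sketch, assuming a Node class along these lines (the class name and demo words are illustrative, not from either project), shows how the two pieces work together:

from pylev import levenshtein


class Node(object):
    """Minimal BK-tree node combining the insert/search fragments above."""

    def __init__(self, value):
        self.value = value
        self.children = {}  # edit distance -> child Node

    def insert(self, string):
        dist = levenshtein(string, self.value)
        if dist not in self.children:
            self.children[dist] = Node(string)
            return
        self.children[dist].insert(string)

    def search(self, query, threshold):
        d = levenshtein(self.value, query)
        if d <= threshold:
            yield self.value
        # Only children whose edge distance lies in [d - threshold, d + threshold]
        # can contain matches (triangle inequality).
        for dist, node in self.children.items():
            if d - threshold <= dist <= d + threshold:
                for rv in node.search(query, threshold):
                    yield rv


root = Node("book")
for word in ("books", "cake", "boo", "cape"):
    root.insert(word)
print(sorted(root.search("bool", 1)))  # ['boo', 'book']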
Example #42
def assoc_sites_districts():
    """
    Associates orphan sites with districts either through fuzzy matching
    or creating schools as intermediaries between sites and districts.
    """

    with current_app.app_context():
        g.db_session = create_db_session()
        from orvsd_central.models import Site, School, District
        from orvsd_central.util import create_school_by_district_site
        from collections import namedtuple
        import pylev

        orphan_sites = set(Site.query.filter_by(school_id=None).all())
        assigned_sites = set()
        schools = School.query.all()
        num_matches = 0
        match_tuple = namedtuple('match', ['id', 'name'])
        match = None

        # If a site belongs to more than 1 school, just default to creating
        # by a district.

        print 'School Matching:'
        for site in orphan_sites:
            for school in schools:
                # Check for names as subsets or <=3 levenshtein distance.
                if (site.name in school.name or school.name in site.name or
                        pylev.levenshtein(site.name, school.name) <= 3):
                    num_matches += 1
                    match = match_tuple(id=school.id, name=school.name)
            if num_matches == 1:
                print ('School: {0} and Site: {1} matched.'
                       .format(match.name, site.name))
                site.school_id = match.id
                assigned_sites.add(site)
            num_matches = 0
            match = None

        g.db_session.commit()
        orphan_sites = orphan_sites - assigned_sites

        print '\nDistrict Matching: '

        # Districts next, with anything that's left.
        districts = District.query.all()
        for site in orphan_sites:
            for district in districts:
                if site.name in district.name or district.name in site.name:
                    print ('District: {0} or Site: {1} contained in the other.'
                           .format(district.name, site.name))
                    school = create_school_by_district_site(district, site)
                    site.school_id = school.id
                    assigned_sites.add(site)
                    break

                # Use Levenshtein Distance for fuzzy matching
                elif pylev.levenshtein(site.name, district.name) <= 3:
                    print ('District: {0} and Site: {1} fuzzy matched.'
                           .format(district.name, site.name))
                    school = create_school_by_district_site(district, site)
                    site.school_id = school.id
                    assigned_sites.add(site)
                    break

        g.db_session.commit()
        orphan_sites = orphan_sites - assigned_sites

        print '\nRemaining Sites: '
        print '\t' + '\n\t'.join((site.name for site in orphan_sites))
Example #43
File: tests.py Project: tauhid12/pylev
 def test_painful(self):
     # This is much faster than the above.
     self.assertEqual(pylev.levenshtein('CUNsperrICY', 'conspiracy'), 8)
Example #44
    a = g.replace('"', '').replace("/", ' ').replace("-", " ").strip()
    if a not in SIRS:
        temp1 = one_ave(a.lower(), pattern, "av")
        gtfs_terms.append(temp1)
        orig_gtfs.append(g)

f2.close()

bestmatches = {} #Where we'll store matches.

#Compare each station in the turnstile data to each station in the gtfs feed. 
for t in xrange(0, len(turn_terms)):
    for g in xrange(0, len(gtfs_terms)):
       
        #Compute distance:
        tinylist = [int(distanceoffset(turn_terms[t], gtfs_terms[g])) +
                    int(pylev.levenshtein(gtfs_terms[g], turn_terms[t])) +
                    int(isinside(turn_terms[t], gtfs_terms[g])) +
                    int(samewords(turn_terms[t], gtfs_terms[g])) +
                    int(penalize(gtfs_terms[g], turn_terms[t])),
                    orig_gtfs[g], gtfs_terms[g], orig_turn[t]]
        
	#Make the highest default so anything better will take its place.   
        bestmatches.setdefault(turn_terms[t], [len(turn_terms[t])])
        r_best.setdefault(g, [len(gtfs_terms[g])])

	#Check against previous, update if it's a better match for both words than the things they matched before.
        if tinylist[0] < bestmatches[turn_terms[t]][0] and tinylist[0] < r_best[g][0]:
            bestmatches[turn_terms[t]] = tinylist
            r_best[g] = [tinylist[0], turn_terms[t]]
#            print turn_terms[t], tinylist
#            if "av n" in turn_terms[t]:
#                print bestmatches[turn_terms[t]], tinylist

f3 = open('./matchtable.txt', 'w') #Now stick it all in a nice file.
Example #45
File: tester.py Project: iamyaro/wikipy
import pylev
import editdistance
import distance



print pylev.levenshtein('abc', '123abc567')
#print editdistance.eval('abc', 'abc')
Example #46
File: tests.py Project: tauhid12/pylev
 def test_long(self):
     self.assertEqual(pylev.levenshtein('confide', 'deceit'), 6)
Example #47
File: tests.py Project: tauhid12/pylev
 def test_classic(self):
     self.assertEqual(pylev.levenshtein('kitten', 'sitting'), 3)
Example #48
B.add_node("Dummy1", demand = 1)
B.add_node("Dummy2", demand = 1)

turn_terms.append("Dummy1")
turn_terms.append("Dummy2")

f2.close()

bestmatches = {}
sawts = {}


for t in turn_terms:		
    for g in google_terms:      
         #Compute distance with levenshtein and numbers
        distance = int(pylev.levenshtein(g,t)) + int(distanceoffset(t, g)) + int(isinside(t, g)) 
# int(samewords(t, g))
        #turnstrings = orig_google[g], google_terms[g], orig_turn[t]
        B.add_edge(g, t, weight = distance)
        #if distance < 3:
        #    print "google = ", g, "turn = ", t, "distance = ",  distance
#print B.number_of_edges()

p_match = []
c = list(B.edges()) #(< probably don't need)
for (n1, n2) in c:
    if B.edge[n1][n2]['weight'] <= 0:
        B.remove_edge(n1, n2)
        p_match.append((n1,n2))
    #otherwise print out top five matches
Example #49
f1.close()
f2.close()


perfectmatches = {}
bestmatches = {}
nextbestmatches = {}

#Compare every station in the turnstile feed with every station in the google feed. 

for g_station in gtfs_terms:		
    for ts_station in ts_terms:
        turnstile = wordnospaces(ts_station)
        google = wordnospaces(g_station)
        if pylev.levenshtein(turnstile, google) == 0: 	#If the distance is 0, we have a perfect match!
            tinylist1 = [0, ts_station]
            perfectmatches[g_station] = tinylist1
            break
        else:
            bestmatches.setdefault(g_station, [len(g_station)])
            nextbestmatches.setdefault(g_station, [len(g_station)])
            tinylist = [int(distanceoffset(ts_station, g_station)) + int(pylev.levenshtein(turnstile, google)), ts_station]

            if tinylist[0] < bestmatches[g_station][0]:
                nextbestmatches[g_station] = bestmatches[g_station]
                bestmatches[g_station] = tinylist


f3 = open('./matchtable.txt', 'w')
for p in perfectmatches:
Example #50
                print "DEG read: " + a + " " + b
                print rev_comp(a+b)
                print "SRA"
            # Given a seed, for every seed-matching pair of DEG and SRA, do pairwise alignment 
            for k in seeds[i]:
                if DEBUG:
                    print k.seq
                l = len(k.seq)
                # s1: seq from around DEG sites
                ab = a + b
                s1 = rev_comp(ab)[0: l]
                # s2: seq from SRA (excluding the first pos, i.e. the 1st position does not matter)
                s2 = k.seq
                # print "s1: " + s1
                # print "s2: " + s2
                ed = pylev.levenshtein(s1[10:], s2[10:])
                if ed <= 100:
                    if DEBUG:
                        print "DEG:" + a + b
                        print "s1 from DEG: " + s1[0] + "|" + s1[1:10] + "|" + s1[10:]
                        print "s2 from SRA: " + s2[0] + "|" + s2[1:10] + "|" + s2[10:]
                        print "ed_x_pos1: " + str(ed)
                    # Only do alignment for the rest of the seq (i.e. ignore the 1st position and the seed region ( 2-10, or [1, 10) )
                    # since they are supposed to be perfectly matched
                    # alignments = pairwise2.align.globalxx(s1[10:], s2[10:])
                    # emphasize g10-g21
                    # m: A match score is the score of identical chars, otherwise mismatch score.

                    # s: Same open and extend gap penalties for both sequences.
                    # d: The sequences have different open and extend gap penalties.
                    # alignments = pairwise2.align.globalms(s1[10:], s2[10:], 2, -1, -4, -1)
Example #51
def levenshteinIndex(str1,str2):
	distance = pylev.levenshtein(str1,str2)
	return (1-(float)(distance)/max([len(str1),len(str2)]))
Example #52
#    print ts_terms[v], orig_ts[v]

f1.close()
f2.close()

bestmatches = {}
sawts = {}

#Compare each station in the turnstile data to each station in the google feed. 
for t in xrange(0, len(turn_terms)):		
    for g in xrange(0, len(google_terms)):
        #Make the highest default so anything better will take its place.
        bestmatches.setdefault(turn_terms[t], [len(turn_terms[t])])
            
        #Compute distance with levenshtein and numbers
        tinylist = [int(distanceoffset(turn_terms[t], google_terms[g])) +
                    int(pylev.levenshtein(google_terms[g], turn_terms[t])) +
                    isinside(turn_terms[t], google_terms[g]) +
                    samewords(turn_terms[t], google_terms[g]),
                    orig_google[g], google_terms[g], orig_turn[t]]

        if tinylist[0] < bestmatches[turn_terms[t]][0]:
            bestmatches[turn_terms[t]] = tinylist

f3 = open('./matchtable.txt', 'w')

#for g in xrange(0, len(bestmatches)):
#    for x in xrange(0, len(g)):
#        print 
for g in bestmatches:
    f3.write(g + ",")
    for x in xrange(0, len(bestmatches[g])):
        if x == len(bestmatches[g])-1:
            f3.write(str(bestmatches[g][x]).strip())
        else:
Example #53
def levenshtein_ratio(str_one, str_two):
  """
  Levenshtein ratio
  """
  str_len = len(str_one + str_two)
  return (str_len - pylev.levenshtein(str_one, str_two)) / float(str_len)
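For example (a quick check of the arithmetic, not from the source project): levenshtein("kitten", "sitting") is 3 and the combined length is 13, so the ratio is (13 - 3) / 13, roughly 0.769.

print(levenshtein_ratio("kitten", "sitting"))  # ~0.769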
Example #54
#    print ts_terms[v], orig_ts[v]

f1.close()
f2.close()

bestmatches = {}
sawts = {}

#Compare each station in the turnstile data to each station in the google feed. 
for g in xrange(0, len(gtfs_terms)):		
    for t in xrange(0, len(ts_terms)):
        #Make the highest default so anything better will take its place.
        bestmatches.setdefault(gtfs_terms[g], [len(gtfs_terms[g])])
            
        #Compute distance with levenshtein and numbers
        tinylist = [int(distanceoffset(ts_terms[t], gtfs_terms[g])) +
                    int(pylev.levenshtein(gtfs_terms[g], ts_terms[t])) +
                    isinside(ts_terms[t], gtfs_terms[g]) +
                    samewords(ts_terms[t], gtfs_terms[g]),
                    orig_gtfs[g], ts_terms[t], orig_ts[t]]

        if tinylist[0] < bestmatches[gtfs_terms[g]][0]:
            bestmatches[gtfs_terms[g]] = tinylist

f3 = open('./matchtable2.txt', 'w')

print bestmatches
#for g in xrange(0, len(bestmatches)):
#    for x in xrange(0, len(g)):
#        print 
#for g in bestmatches:
#    f3.write(g + ",")
#    for x in xrange(0, len(bestmatches[g])):
#        if x == len(bestmatches[g])-1:
#            f3.write(str(bestmatches[g][x]).strip())
Example #55
File: tests.py Project: tauhid12/pylev
 def test_same(self):
     self.assertEqual(pylev.levenshtein('kitten', 'kitten'), 0)
def get_sense(word, lang=u"pl_PL"):
    senses = get_senses(word, lang)
    counter[0] += 1
    if counter[0] % 100 == 0:
        print "sense", counter[0]
    return min(senses, key=lambda x: pylev.levenshtein(x, word)) if senses else None
Example #57
File: tests.py Project: tauhid12/pylev
 def test_empty(self):
     self.assertEqual(pylev.levenshtein('', ''), 0)
Example #58
def penalize(string1, string2):
    if pylev.levenshtein(string1, string2) > min(len(string1), len(string2)):
        return 3
    return 0
Example #59
 def apply_lev(self, threshold):
     if self.proposals:
         # Iterate over a copy of the keys so entries can be deleted safely.
         for value in list(self.proposals.keys()):
             if pylev.levenshtein(value, self.goal) > threshold:
                 del self.proposals[value]