def topicsPrint(URL, filtered_content):
    """Return the topic that is most similar to ``filtered_content``.

    The topics are fetched from ``URL`` (the JSON payload is expected to
    carry them under the ``'names'`` key).  When the request fails, the
    payload cannot be parsed, or no topic is listed, the user is prompted
    to type candidate topics until they enter ``exit``.

    Args:
        URL: Address queried with ``requests.get``.
        filtered_content: Text every candidate topic is compared against
            using ``textdistance.ratcliff_obershelp``.

    Returns:
        The candidate with the highest score (the first one on ties), or
        ``None`` when neither the request nor the user supplied a topic.
    """
    headers = {'Accept':'application/vnd.github.mercy-preview+json', 'Authorization': [USERNAME]}
    req = requests.get(URL, headers=headers)

    # Only trust the payload of a successful response; an error body has no
    # 'names' key, so it must not be indexed before the status is checked.
    content = []
    if req.status_code == requests.codes.ok:
        try:
            payload = req.json()
        except ValueError:
            payload = None
        if isinstance(payload, dict):
            content = list(payload.get('names') or [])

    entered_manually = not content
    if entered_manually:
        print('Content was not found')
        while True:
            currContent = input("Enter a topic for this repo or 'exit' to submit current suggestions: ")
            if currContent == "exit":
                break
            content.append(currContent)
        if not content:
            # Nothing to rank: avoid calling max() on an empty mapping.
            return None

    scores = {
        topic: textdistance.ratcliff_obershelp(filtered_content, topic)
        for topic in content
    }
    if entered_manually:
        print(scores)

    return max(scores, key=scores.get)
Пример #2
0
def filter_blends(word, first_blend, second_blend, limit="1237"):
    """Keep only the blend candidates that pass the selected metric checks.

    Every candidate is scored against ``word`` with seven textdistance
    measures; each score is handed to ``metric.qualified`` together with a
    pair of flags ((True, False) for ``first_blend`` candidates and
    (False, True) for ``second_blend`` candidates).

    Args:
        word: Reference string.
        first_blend: Iterable of candidates for the first blend.
        second_blend: Iterable of candidates for the second blend.
        limit: Iterable of 1-based digits selecting which of the seven
            checks must all pass (1=jw_sim, 2=levenshtein_sim, 3=ro_sim,
            4=needleman_wunsch, 5=smith_waterman, 6=gotoh, 7=strcmp95).

    Returns:
        A ``(refined_first_blend, refined_second_blend)`` tuple of lists.
    """

    def _passes(candidate, first_flag, second_flag):
        # All seven checks are evaluated, in this order, for every candidate.
        verdicts = [
            metric.qualified("jw_sim",
                             textdistance.jaro_winkler(word, candidate),
                             first_flag, second_flag),
            metric.qualified(
                "levenshtein_sim",
                textdistance.levenshtein.normalized_similarity(
                    word, candidate),
                first_flag, second_flag),
            metric.qualified("ro_sim",
                             textdistance.ratcliff_obershelp(word, candidate),
                             first_flag, second_flag),
            metric.qualified("needleman_wunsch",
                             textdistance.needleman_wunsch(word, candidate),
                             first_flag, second_flag),
            metric.qualified("smith_waterman",
                             textdistance.smith_waterman(word, candidate),
                             first_flag, second_flag),
            metric.qualified("gotoh", textdistance.gotoh(word, candidate),
                             first_flag, second_flag),
            metric.qualified("strcmp95",
                             textdistance.strcmp95(word, candidate),
                             first_flag, second_flag),
        ]
        accepted = True
        for digit in limit:
            accepted &= verdicts[int(digit) - 1]
        return accepted

    kept_first = [
        candidate for candidate in first_blend
        if _passes(candidate, True, False)
    ]
    kept_second = [
        candidate for candidate in second_blend
        if _passes(candidate, False, True)
    ]
    return kept_first, kept_second
Пример #3
0
def valid_song(*input):
    """Tell whether two (track, artist) descriptions refer to the same song.

    Expects four strings: the two track names followed by the two artist
    names.  Each value is lower-cased and stripped before comparison.

    Returns:
        True when both the track pair and the artist pair reach a
        ``ratcliff_obershelp`` score of at least 0.5.
    """
    cleaned = [value.lower().strip() for value in input]
    track_threshold, artist_threshold = 0.5, 0.5

    track_score = ratcliff_obershelp(cleaned[0], cleaned[1])
    artist_score = ratcliff_obershelp(cleaned[2], cleaned[3])

    return track_score >= track_threshold and artist_score >= artist_threshold
Пример #4
0
def add_query_features(df, inc, exc, k1list, k2list):
    """Return a copy of ``df`` with query-level summary columns appended.

    Added columns: the number of entries and the longest entry length of
    ``k1list`` and ``k2list``, plus five string measures computed between
    ``inc`` and ``exc`` (jellyfish Jaro and Levenshtein, textdistance
    Jaccard, Sorensen and Ratcliff-Obershelp).  Every column holds the
    same value on all rows.  ``df`` itself is not modified.
    """
    df_new = df.copy()

    k1_lengths = [len(entry) for entry in k1list]
    k2_lengths = [len(entry) for entry in k2list]
    longest_k1 = max(k1_lengths)
    longest_k2 = max(k2_lengths)

    size_features = {
        'k1_count': len(k1list),
        'k2_count': len(k2list),
        'k1_max': longest_k1,
        'k2_max': longest_k2,
    }
    for column, value in size_features.items():
        df_new[column] = value

    # Dict literals evaluate in order, so the measures are computed in the
    # same sequence as the columns are added.
    distance_features = {
        'inc_jaro_exc': jellyfish.jaro_distance(inc, exc),
        'inc_lev_exc': jellyfish.levenshtein_distance(inc, exc),
        'inc_ji_exc': textdistance.jaccard(inc, exc),
        'inc_sd_exc': textdistance.sorensen(inc, exc),
        'inc_ro_exc': textdistance.ratcliff_obershelp(inc, exc),
    }
    for column, value in distance_features.items():
        df_new[column] = value

    return df_new
    def knn_search(self, word):
        """Look up the vocabulary entry closest to ``word``.

        Every token of ``self.vocabulary`` is scored against ``word`` with
        the measure named by ``self.algorithm`` ("levenshtein", "jaccard"
        or "ratcliff_obershelp"); tokens are skipped when the algorithm
        name is none of these.

        Arguments:
            word {string} -- Input word.

        Returns:
            The element of ``self.embeds`` at the position of the highest
            score (the first one on ties).

        Raises:
            ValueError: When no score was produced (empty vocabulary or
                unrecognised algorithm name).
        """
        scores = []
        for token in self.vocabulary:
            algorithm = self.algorithm
            if algorithm == "levenshtein":
                score = textdistance.levenshtein.normalized_similarity(
                    word, token)
            elif algorithm == "jaccard":
                score = textdistance.jaccard(word, token)
            elif algorithm == "ratcliff_obershelp":
                score = textdistance.ratcliff_obershelp(word, token)
            else:
                continue
            scores.append(score)

        best_position = scores.index(max(scores))
        return self.embeds[best_position]
    def search(self, word, limit=0.7):
        """Look up a close word, accepting the first one above a threshold.

        Tokens of ``self.words`` are scored one by one with the measure
        named by ``self.algorithm`` ("levenshtein", "jaccard" or
        "ratcliff_obershelp").  As soon as a score exceeds ``limit`` the
        matching embed is returned; otherwise the embed of the best score
        seen is returned.

        Arguments:
            word {string} -- Input word.

        Keyword Arguments:
            limit {float} -- Score above which a token is accepted
                immediately (default: {0.7})

        Returns:
            An element of ``self.embeds``.

        Raises:
            ValueError: When no score was produced (empty word list or
                unrecognised algorithm name).
        """
        scores = []
        for position, token in enumerate(self.words):
            algorithm = self.algorithm
            if algorithm == "levenshtein":
                score = textdistance.levenshtein.normalized_similarity(
                    word, token)
            elif algorithm == "jaccard":
                score = textdistance.jaccard(word, token)
            elif algorithm == "ratcliff_obershelp":
                score = textdistance.ratcliff_obershelp(word, token)
            else:
                continue
            if score > limit:
                # Good enough: stop at the first token over the threshold.
                return self.embeds[position]
            scores.append(score)

        return self.embeds[scores.index(max(scores))]
Пример #7
0
    def compare(self, statement, other_statement):
        """Score two statements by the ``ratcliff_obershelp`` of their text.

        The ``text`` attribute of each statement is lower-cased and
        converted with ``str`` before being compared.
        """
        lowered_texts = [
            str(item.text.lower()) for item in (statement, other_statement)
        ]
        return ratcliff_obershelp(*lowered_texts)
Пример #8
0
 def criteria_features(x, col):
     """Compare the lower-cased ``x[col]`` with ``crit`` using five measures.

     ``crit`` is taken from the enclosing scope.

     Returns:
         A tuple ``(jaro, levenshtein, jaccard, sorensen,
         ratcliff_obershelp)``; the first two come from jellyfish, the
         remaining three from textdistance.
     """
     lowered = x[col].lower()
     return (
         jellyfish.jaro_distance(lowered, crit),
         jellyfish.levenshtein_distance(lowered, crit),
         textdistance.jaccard(lowered, crit),
         textdistance.sorensen(lowered, crit),
         textdistance.ratcliff_obershelp(lowered, crit),
     )
Пример #9
0
 def sm_features(x, col1, col2):
     """Ratcliff-Obershelp score of two lower-cased fields, NaN if missing.

     A field counts as missing when it is not equal to itself (the NaN
     self-inequality test).  The second field is only inspected when the
     first one is present.
     """
     if x[col1] != x[col1]:
         return np.nan
     if x[col2] != x[col2]:
         return np.nan
     left = x[col1].lower()
     right = x[col2].lower()
     return textdistance.ratcliff_obershelp(left, right)
def EvalSequences(df):
    """Score each row's stemmed pair and keep only the dissimilar rows.

    Adds a ``seq_score`` column to ``df`` (the Ratcliff-Obershelp score of
    ``ST_stemmed`` versus ``Item_stemmed``, scaled by 100), then returns
    the rows whose score is below 40 with a fresh 0-based index.  The
    ``seq_score`` column is written to the frame that was passed in.
    """
    stemmed_pairs = df[["ST_stemmed", "Item_stemmed"]]
    df["seq_score"] = stemmed_pairs.apply(
        lambda pair: textdistance.ratcliff_obershelp(*pair), axis=1)
    df.seq_score = df.seq_score * 100

    low_score_mask = df.seq_score < 40
    df = df[low_score_mask]
    df.reset_index(drop=True, inplace=True)

    print("eval sequences done")
    return df
Пример #11
0
def RatcliffObershelp(str1, match_against):
    """Find the candidate with the highest Ratcliff-Obershelp score.

    Args:
        str1: Reference string.
        match_against: Iterable of candidates.

    Returns:
        A two-element list ``[candidate, score]`` for the first candidate
        with the highest score, or ``['', 0]`` when no candidate scores
        above zero.
    """
    scored = []
    for candidate in match_against:
        scored.append(
            [candidate, textdistance.ratcliff_obershelp(str1, candidate)])

    winner = ['', 0]
    for entry in scored:
        # Strict comparison keeps the earliest of equally good candidates.
        if entry[1] > winner[1]:
            winner = entry
    return winner
Пример #12
0
def map_marie(input_data,
              target_data,
              input_bert_weights,
              target_bert_weights,
              string_match='edit',
              alpha=0.8,
              bert_layers=1,
              top_n=5):
    """Map every input text to its ``top_n`` most similar target texts.

    The score of an (input, target) pair is
    ``alpha * cosine + (1 - alpha) * string_similarity`` where the cosine
    term comes from ``_cal_cosine`` applied to the BERT vectors of both
    texts and the string term depends on ``string_match``.

    Args:
        input_data: Input texts, passed to ``_create_txt2idx``.
        target_data: Target texts, passed to ``_create_txt2idx``.
        input_bert_weights: Forwarded to ``_get_bert_vectors`` for inputs.
        target_bert_weights: Forwarded to ``_get_bert_vectors`` for targets.
        string_match: ``'edit'`` (1 - edit distance / longest length),
            ``'jaccard'`` or ``'ob'`` (Ratcliff-Obershelp).
        alpha: Weight of the cosine term; the string term gets ``1 - alpha``.
        bert_layers: Forwarded to ``_get_bert_vectors``.
        top_n: Number of best targets kept per input.

    Returns:
        ``(mapper, inp_idx2txt, tgt_idx2txt)`` where ``mapper`` maps an
        input index to a list of ``(target_index, score)`` pairs, best
        first.

    Raises:
        ValueError: If ``string_match`` is not one of the supported names
            (previously this surfaced as an unbound-variable error inside
            the loop).
    """

    def _edit_similarity(left, right):
        # Two empty strings are identical; this also avoids dividing by 0.
        longest = max(len(left), len(right))
        if longest == 0:
            return 1.0
        return 1 - editdistance.eval(left, right) / longest

    # Fail fast, before the expensive vector extraction below.
    if string_match == 'edit':
        string_similarity = _edit_similarity
    elif string_match == 'jaccard':
        string_similarity = textdistance.jaccard
    elif string_match == 'ob':
        string_similarity = textdistance.ratcliff_obershelp
    else:
        raise ValueError(
            "string_match must be 'edit', 'jaccard' or 'ob', got %r" %
            (string_match, ))

    inp_txt2idx, inp_idx2txt = _create_txt2idx(input_data)
    tgt_txt2idx, tgt_idx2txt = _create_txt2idx(target_data)

    inp_bert_vectors = _get_bert_vectors(input_bert_weights, input_data,
                                         inp_txt2idx, bert_layers)
    tgt_bert_vectors = _get_bert_vectors(target_bert_weights, target_data,
                                         tgt_txt2idx, bert_layers)

    # The targets do not change while mapping, so list them only once.
    targets = list(tgt_txt2idx.items())
    ord2idx = [tgt_idx for _, tgt_idx in targets]

    mapper = dict()

    for cnt, (inp_txt, inp_idx) in enumerate(inp_txt2idx.items()):
        inp_bert_vector = inp_bert_vectors[inp_idx]

        cos_dist = [
            alpha * (_cal_cosine(inp_bert_vector, tgt_bert_vectors[tgt_idx]))
            for _, tgt_idx in targets
        ]
        str_match = [(1 - alpha) * string_similarity(inp_txt, tgt_txt)
                     for tgt_txt, _ in targets]

        cal_dist = np.add(cos_dist, str_match)
        # Positions of the highest combined scores, best first.
        topn_ord_idx = cal_dist.argsort()[::-1][:top_n]

        mapper[inp_idx] = [(ord2idx[idx], cal_dist[idx])
                           for idx in topn_ord_idx]

        if cnt % 100 == 0:
            print("...Processed %i mappings" % (cnt))

    return mapper, inp_idx2txt, tgt_idx2txt
Пример #13
0
def is_brand(word: str, ignore_keywords: list, score=0.70):
    """Check whether ``word`` closely matches one of the given keywords.

    Each keyword is lower-cased and must pass two tests: ``compare(word,
    keyword)`` has to exceed ``score * 100`` and the Ratcliff-Obershelp
    score of the keyword against ``word`` has to exceed ``score``.

    Args:
        word: Word to test.
        ignore_keywords: Keywords to compare against.
        score: Threshold applied to both tests.

    Returns:
        True as soon as one keyword passes both tests, otherwise False.
    """
    lowered_keywords = (keyword.lower() for keyword in ignore_keywords)
    return any(
        compare(word, keyword) > score * 100
        and textdistance.ratcliff_obershelp(keyword, word) > score
        for keyword in lowered_keywords)
def ratcliff_obershelp_sim_of_blends():
    """Plot Ratcliff-Obershelp scores for the blends in ``data/blends.txt``.

    Each line of the file is split on whitespace; the first field is
    compared with the second and with the third field.  Both score series
    are drawn against the line number and the figure is shown.
    """
    first_scores, second_scores = [], []
    with open("data/blends.txt", 'r') as handle:
        for line in handle:
            fields = line.split()
            origin, first, second = fields[0], fields[1], fields[2]
            first_scores.append(textdistance.ratcliff_obershelp(origin, first))
            second_scores.append(
                textdistance.ratcliff_obershelp(origin, second))

    line_numbers = np.array(list(range(len(first_scores))))
    first_series = np.array(first_scores)
    second_series = np.array(second_scores)

    plt.plot(line_numbers, first_series, color="r", linestyle="-",
             marker="^", linewidth=1)
    plt.plot(line_numbers, second_series, color="b", linestyle="-",
             marker="s", linewidth=1)

    plt.xlabel("x")
    plt.ylabel("y")
    plt.title("ratcliff-obershelp similarity", fontsize=12, color='g')
    print("# first blend: 0.35 ~ 0.85\nsecond blend: 0.45 ~ 0.95")
    plt.show()
Пример #15
0
def get_skill_header(headers, skills):
    """Pick, and remove from ``headers``, the header that best matches a skill.

    Every header is scored against every entry of ``skills`` with
    Ratcliff-Obershelp; the header owning the overall highest score wins
    (the earliest one on ties).

    Args:
        headers: List of header strings; the winner is removed in place.
        skills: Sequence of skill keywords.

    Returns:
        The winning header.

    Raises:
        ValueError: When ``headers`` or ``skills`` is empty.
    """
    best_per_header = []
    for header in headers:
        scored = [(header, textdistance.ratcliff_obershelp(header, skill))
                  for skill in skills]
        if scored:
            best_per_header.append(max(scored, key=lambda pair: pair[1]))

    winner, _ = max(best_per_header, key=lambda pair: pair[1])
    headers.remove(winner)
    return winner
Пример #16
0
def concordance_search(tm_objects, searchCon, matchRate, search_lang):
    """Search translation memories for units resembling ``searchCon``.

    The query and every source segment are tokenised after
    ``removeStopwords``.  A query token found verbatim in the segment
    counts as 1; otherwise its closest segment token (``difflib`` cutoff
    0.85) contributes its Ratcliff-Obershelp score.  The sum is divided by
    the larger of the two token counts and expressed as an integer
    percentage.

    Args:
        tm_objects: Iterable of objects exposing ``file_url``, ``s_lang``,
            ``t_lang`` and ``name``.
        searchCon: Query text.
        matchRate: Minimum percentage a unit needs to be reported.
        search_lang: Not used by this function.

    Returns:
        A list of dicts with the keys ``source``, ``target``, ``tm_name``
        and ``match_rate``, sorted with ``compare_matchrate`` in
        descending order.
    """
    out_sequences = []
    q_tokens = removeStopwords(searchCon).split()
    for tm_object in tm_objects:
        tm_url = os.path.join(settings.MEDIA_ROOT,
                              getattr(tm_object, 'file_url').name)
        tm_s_lang = getattr(tm_object, 's_lang')
        tm_t_lang = getattr(tm_object, 't_lang')
        tm_name = getattr(tm_object, 'name')
        if not os.path.isfile(tm_url):
            continue

        # The context manager closes the file once all units are processed;
        # the handle used to be leaked.
        with open(tm_url, 'rb') as fin:
            tmx_file = tmxfile(fin, tm_s_lang, tm_t_lang)
            for node in tmx_file.unit_iter():
                sequence = node.getsource()
                s_tokens = removeStopwords(sequence).split()

                longest = max(len(s_tokens), len(q_tokens))
                if longest == 0:
                    # Nothing to compare; also avoids a division by zero.
                    continue

                total_rate = 0
                for q_token in q_tokens:
                    if q_token in s_tokens:
                        total_rate += 1
                        continue
                    matched = difflib.get_close_matches(q_token,
                                                        s_tokens,
                                                        n=1,
                                                        cutoff=0.85)
                    if matched:
                        total_rate += float(
                            textdistance.ratcliff_obershelp(
                                q_token, matched[0]))

                average_rate = int(total_rate / longest * 100)
                if average_rate >= matchRate:
                    out_sequences.append({
                        'source': sequence,
                        'target': node.gettarget(),
                        'tm_name': tm_name,
                        'match_rate': average_rate
                    })
    out_sequences.sort(key=compare_matchrate, reverse=True)
    return out_sequences
Пример #17
0
def get_otherHeaders(headers, otherHeaders):
    """Select the headers that closely match one of the known other headers.

    Each header is scored against every entry of ``otherHeaders`` with
    Ratcliff-Obershelp and keeps its best score.

    Args:
        headers: Iterable of header strings.
        otherHeaders: Sequence of reference header names.

    Returns:
        The set of headers whose best score is greater than 0.7.
    """
    best_per_header = []
    for header in headers:
        scored = [(header, textdistance.ratcliff_obershelp(header, reference))
                  for reference in otherHeaders]
        if scored:
            best_per_header.append(max(scored, key=lambda pair: pair[1]))

    ranked = sorted(best_per_header, key=lambda pair: pair[1], reverse=True)

    chosen = set()
    for header, score in ranked:
        if score > 0.7:
            chosen.add(header)
    return chosen
Пример #18
0
def similarity(type, a, b):
    """Compute a string similarity with the measure named by ``type``.

    Supported names: 'hamming' and 'levenshtein' (normalized similarity),
    'jaro_winkler', 'jaccard' and 'sorensen' (computed on the
    whitespace-separated tokens of both strings) and 'ratcliff_obershelp'.

    Example:
        input:  type: hamming (similarity type)
                a: John (string 1)
                b: John Snow (string 2)
        output: 0.73 (probability)

    Returns:
        The score produced by textdistance, or None for any other name.
    """
    if type == 'hamming':
        return textdistance.hamming.normalized_similarity(a, b)
    if type == 'levenshtein':
        return textdistance.levenshtein.normalized_similarity(a, b)
    if type == 'jaro_winkler':
        return textdistance.jaro_winkler(a, b)
    if type == 'jaccard':
        left_tokens, right_tokens = a.split(), b.split()
        return textdistance.jaccard(left_tokens, right_tokens)
    if type == 'sorensen':
        left_tokens, right_tokens = a.split(), b.split()
        return textdistance.sorensen(left_tokens, right_tokens)
    if type == 'ratcliff_obershelp':
        return textdistance.ratcliff_obershelp(a, b)
    return None
 def comparar_textos(self, str1, str2, metodo):
     """Print the similarity of two texts as a percentage.

     ``metodo`` selects the measure: "jaccard", "levenshtein" (single
     words only; both inputs are lower-cased first) or
     "ratcliff_obershelp".  A message is printed instead of a result when
     the method is unknown or when Levenshtein is requested for inputs
     containing more than one word.  Nothing is returned.
     """
     if metodo == "jaccard":
         similaridade = 100 * textdistance.jaccard(str1, str2)
     elif metodo == "levenshtein":
         if len(str1.split()) > 1 or len(str2.split()) > 1:
             print(
                 "A similaridade pelo método de Levenshtein pode comparar apenas palavras, não textos"
             )
             return
         str1, str2 = str1.lower(), str2.lower()
         similaridade = 100 * textdistance.levenshtein.normalized_similarity(
             str1, str2)
     elif metodo == "ratcliff_obershelp":
         similaridade = 100 * textdistance.ratcliff_obershelp(str1, str2)
     else:
         print("O método informado não está implementado!")
         return

     print(
         "A similaridade pelo método %s entre os 2 textos informados é de %.2f %%"
         % (metodo, similaridade))
Пример #20
0
def fakeTextDetect(request, format=None):
    """Rate submitted text against the stored ``FakeText`` records.

    The request data is validated with ``TextSerializer``.  The submitted
    ``fake_text`` is compared with every stored record using
    Ratcliff-Obershelp; when the best score exceeds 50% the response
    carries the two feedback fields of the best matching record.

    Returns:
        A 200 ``Response`` with a ``Description`` (plus ``Feedback_1`` and
        ``Feedback_2`` for a strong match), or a 400 ``Response`` with the
        serializer errors when validation fails.
    """
    print(request.data)
    serializer = TextSerializer(data=request.data)
    if not serializer.is_valid():
        print(serializer.errors)
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

    records = FakeText.objects.all()
    best_score = -1
    first_feedback = ""
    second_feedback = ""
    for record in records:
        score = textdistance.ratcliff_obershelp(
            serializer.data['fake_text'], record.fake_text)
        if score > best_score:
            best_score = score
            first_feedback = record.feedback_one
            second_feedback = record.feedback_two

    print(best_score)

    # More than 50% similar to a known record: most likely fake news.
    if best_score * 100 > 50:
        return Response(
            {
                'Description': 'Strong Likeley hood of fake news.',
                'Feedback_1': first_feedback,
                'Feedback_2': second_feedback
            },
            status=status.HTTP_200_OK)

    # Otherwise it is less likely to be fake news.
    return Response({'Description': 'Less Likeley hood of fake news.'},
                    status=status.HTTP_200_OK)
    # Build the text fragments of one comparison record (smiles vs. atp) and
    # write them to `res` in a single call. This is the body of a loop whose
    # header is outside the visible lines.
    # NOTE(review): `nl` and the label variables (`dl`, `lev`, `over`,
    # `lcsseq`, `lcsstr`, `gest`) are defined elsewhere — presumably a newline
    # and caption strings; confirm.
    lns = []
    # NOTE(review): `diff` is computed but never used in the visible lines.
    diff = abs(len(smiles) - len(atp))
    lns.append(name)
    lns.append(nl)
    lns.append(i[-1])
    lns.append(nl)
    # Damerau-Levenshtein result.
    lns.append(dl)
    lns.append(str(td.damerau_levenshtein(smiles, atp)))
    lns.append(nl)
    # Levenshtein result.
    lns.append(lev)
    lns.append(str(td.levenshtein(smiles, atp)))
    lns.append(nl)
    # Overlap result.
    lns.append(over)
    lns.append(str(td.overlap(smiles, atp)))
    lns.append(nl)
    # Length of the value returned by td.lcsseq.
    lns.append(lcsseq)
    lns.append(str(len(td.lcsseq(smiles, atp))))
    lns.append(nl)
    # Length of the value returned by td.lcsstr.
    lns.append(lcsstr)
    lns.append(str(len(td.lcsstr(smiles, atp))))
    lns.append(nl)
    # Ratcliff-Obershelp result.
    lns.append(gest)
    lns.append(str(td.ratcliff_obershelp(smiles, atp)))
    lns.append(nl)
    lns.append(nl)
    # Visual separator between records.
    lns.append('________________________________')
    lns.append(nl)
    res.writelines(lns)

# Close the output once all records are written.
res.close()
Пример #22
0
def Seq_StringDistance(str_seq, str_ref, method="hamming"):
    """Compare every string of ``str_seq`` with ``str_ref``.

    Args:
        str_seq: Iterable of strings.
        str_ref: Reference string each element is compared with.
        method: Name of the measure.  One of "hamming", "levenshtein",
            "damerau_lev", "j-winkler", "smith-waterman", "jaccard",
            "sorensen-dice", "tversky", "tanimoto", "cosine", "ratcliff"
            or "bwt".

    Returns:
        A list with the value obtained by calling the matching
        textdistance algorithm on each element (whether that is a
        distance or a similarity depends on the algorithm), or ``None``
        when ``method`` is not a supported name.
    """
    # Method name -> attribute of the textdistance module.
    algorithm_names = {
        "hamming": "hamming",
        "levenshtein": "levenshtein",
        "damerau_lev": "damerau_levenshtein",
        "j-winkler": "jaro_winkler",
        "smith-waterman": "smith_waterman",
        "jaccard": "jaccard",
        "sorensen-dice": "sorensen_dice",
        "tversky": "tversky",
        "tanimoto": "tanimoto",
        "cosine": "cosine",
        "ratcliff": "ratcliff_obershelp",
        "bwt": "bwtrle_ncd",
    }

    # Names are matched by value. The former `is` comparisons tested object
    # identity, which only works for strings the interpreter happens to
    # share and silently returned None for equal strings built at runtime.
    attribute = algorithm_names.get(method) if isinstance(method,
                                                          str) else None
    if attribute is None:
        return None

    algorithm = getattr(textdistance, attribute)
    return [algorithm(str_seq_i, str_ref) for str_seq_i in str_seq]
Пример #23
0
# Compare every .txt file of /app/input_txt_a with every .txt file of
# /app/input_txt_b using five textdistance measures, print the rounded scores
# and write them to a timestamped CSV file in /app/output_csv.
result_array.append(['file_A', 'file_B', 'hamming_normalized', 'levenshtein_normalized', 'jaro_winkler', 'ratcliff_obershelp', 'jaccard'])
for filename_a in os.listdir('/app/input_txt_a'):
    if not filename_a.endswith(".txt"):
        continue
    path_a = '/app/input_txt_a'+'/'+filename_a
    # Read file A once and release the handle before the inner loop.
    with open(path_a, 'r') as file_a:
        data_a = file_a.read()

    for filename_b in os.listdir('/app/input_txt_b'):
        if not filename_b.endswith(".txt"):
            continue
        path_b = '/app/input_txt_b'+'/'+filename_b
        with open(path_b, 'r') as file_b:
            data_b = file_b.read()

        print('(A: '+filename_a+') VS (B: '+filename_b+')')
        hamming_normalized = round(textdistance.hamming.normalized_similarity(data_b, data_a), 2)
        print('     Hamming percent normalized similarity: '+str(hamming_normalized))
        levenshtein_normalized = round(textdistance.levenshtein.normalized_similarity(data_b, data_a), 2)
        print('     Levenshtein percent normalized similarity: '+str(levenshtein_normalized))
        jaro_winkler = round(textdistance.jaro_winkler(data_b, data_a), 2)
        print('     Jaro/Winkler percent similarity: '+str(jaro_winkler))
        ratcliff_obershelp = round(textdistance.ratcliff_obershelp(data_b, data_a), 2)
        print('     Ratcliff/Obershelp percent similarity: '+str(ratcliff_obershelp))
        jaccard = round(textdistance.jaccard(data_b, data_a), 2)
        # This line used to print the Ratcliff/Obershelp value by mistake.
        print('     Jaccard percent similarity: '+str(jaccard))
        result_array.append([filename_a, filename_b, hamming_normalized, levenshtein_normalized, jaro_winkler, ratcliff_obershelp, jaccard])

now = datetime.now()
timestamp = now.timestamp()

# newline='' lets the csv module control line endings itself.
with open("/app/output_csv/confrontation"+str(timestamp)+".csv", "w+", newline='') as my_csv:
    csvWriter = csv.writer(my_csv, delimiter=',')
    csvWriter.writerows(result_array)
Пример #24
0
def _strip_titles(full_name):
    """Return ``full_name`` without the honorifics "Mr.", "Dr." and "Ms."."""
    for title in ("Mr.", "Dr.", "Ms."):
        full_name = full_name.replace(title, '')
    return full_name


def _last_name(full_name):
    """Return the last space-separated part, or the last dot-separated part
    when the name contains no space."""
    parts = full_name.split(' ')
    if len(parts) == 1:
        return full_name.split('.')[-1]
    return parts[-1]


def _name_letters(full_name):
    """Return the set of characters of a name, ignoring ',', '.' and ' '."""
    return {char for char in full_name if char not in (",", ".", " ")}


def SearchResultsView(request):
    """Render the overview page of the company named by the ``q`` parameter.

    Besides the filings and funds of the company, the view lists for each
    of its directors the other companies on whose board a director with a
    closely matching name sits.  Two names match when, after removing
    honorifics, they share the last name, the Jaccard index of their
    character sets exceeds 0.65 and their Ratcliff-Obershelp score exceeds
    0.75.

    Unauthenticated users get the overview only for the query ``'HD'``;
    for any other query they are shown ``about.html``.

    NOTE(review): ``Company.objects.get`` raises when no company has the
    requested name; that case is not handled here.
    """
    template_name = 'companyOverview.html'

    extended_template = 'base_company.html'
    if request.user.is_authenticated:
        extended_template = 'base_company_member.html'

    query = request.GET.get('q')
    mycompany = Company.objects.get(name=query)
    filings = Filing.objects.filter(cik=mycompany.cik).order_by('-filingdate')

    # Normalise the company name to the spelling used in the funds table.
    name = mycompany.name.upper()
    for old, new in (('INTERNATIONAL', 'INTL'), (' /DE', ''), ('/DE', ''),
                     ('INC.', 'INC'), (',', '')):
        name = name.replace(old, new)

    funds = Funds.objects.raw(
        'SELECT * FROM edgarapp_funds WHERE company = %s ORDER BY share_prn_amount+0 DESC LIMIT 100',
        [name])

    directors = Directors.objects.filter(
        company=mycompany.name).order_by('-director')

    allDirectors = Directors.objects.all()

    executives = Executives.objects.filter(company=mycompany.name)
    currYear = datetime.today().year

    # Executives whose filing date falls in the current year.
    exectable = [
        executive for executive in executives
        if executive.filingdate.split('-')[0] == str(currYear)
    ]

    matches = []
    for person in directors:
        comps = []
        if person:
            # All three honorifics are removed; previously every replace
            # restarted from the raw name, so only "Ms." was ever stripped.
            personA = _strip_titles(person.director)
            a = _name_letters(personA)
            aLast = _last_name(personA)

            for check in allDirectors:
                if not check:
                    continue
                personB = _strip_titles(check.director)
                if aLast != _last_name(personB):
                    continue

                # Cheap Jaccard pre-filter on the character sets,
                # threshold of .65.
                b = _name_letters(personB)
                union = a.union(b)
                if union:
                    jaccard = float(len(a.intersection(b)) / len(union))
                else:
                    jaccard = 1
                if jaccard <= 0.65:
                    continue

                # Ratcliff-Obershelp for the final decision, threshold of
                # .75; the searched company itself is excluded.
                sequence = textdistance.ratcliff_obershelp(personA, personB)
                if sequence > 0.75 and mycompany.name != check.company:
                    comps.append(check.company)

        if not comps:
            comps.append('Director is not on the board of any other companies')
        matches.append(comps)

    # object_list is (q, (companyname, name), filings, funds,
    # directors+matches, executives+matches).
    # NOTE(review): the executives are zipped with the directors' matches —
    # confirm this pairing is intended.
    object_list = [
        query,
        (mycompany.name, mycompany.name),
        filings,
        funds,
        zip(directors, matches),
        zip(exectable, matches),
    ]

    if request.user.is_authenticated or query == 'HD':
        return render(request, template_name, {
            'object_list': object_list,
            'extended_template': extended_template
        })

    return render(request, 'about.html', {'extended_template': 'base.html'})
Пример #25
0
    # Tail of a per-sample timing loop (the loop header and the earlier
    # measurements are outside the visible lines). Each measure is timed by
    # taking dt.datetime.now() before and after the call.
    # NOTE(review): timedelta.microseconds is only the sub-second component of
    # the elapsed time, not the total; a call lasting a second or more would be
    # under-reported — confirm whether total_seconds() was intended.
    n2 = dt.datetime.now()
    ji_time.append((n2 - n1).microseconds)
    #end = timeit.timeit()
    #ji_time.append(end - start)

    # Time textdistance.sorensen on the current pair.
    #start = timeit.timeit()
    n1 = dt.datetime.now()
    sd = textdistance.sorensen(raw_text1, raw_text2)
    n2 = dt.datetime.now()
    sd_time.append((n2 - n1).microseconds)
    #end = timeit.timeit()
    #sd_time.append(end - start)

    # Time textdistance.ratcliff_obershelp on the current pair.
    #start = timeit.timeit()
    n1 = dt.datetime.now()
    ro = textdistance.ratcliff_obershelp(raw_text1, raw_text2)
    n2 = dt.datetime.now()
    ro_time.append((n2 - n1).microseconds)
    #end = timeit.timeit()
    #ro_time.append(end - start)

# Report the mean recorded time per measure.
# NOTE(review): the divisor 50000 is hard-coded — presumably the number of
# timed samples; confirm against the loop.
print("jellyfish.jaro_distance")
print(sum(jd_time) / 50000)

print("jellyfish.levenshtein_distance")
print(sum(ld_time) / 50000)

print("textdistance.jaccard")
print(sum(ji_time) / 50000)

print("textdistance.sorensen")
Пример #26
0
def randcliff(string1, string2):
    """Return the textdistance Ratcliff-Obershelp score of two strings."""
    score = textdistance.ratcliff_obershelp(string1, string2)
    return score
# Pair up similar transactions of `statement`: every 'Clean Txn' text is
# compared with the 'Clean Txn' text of the rows selected below, and pairs
# scoring at least 0.7 (Ratcliff-Obershelp) are recorded in parallel lists.
# NOTE(review): `table`, `date`, `similarity`, `right_side` and `left_side`
# are not defined in the visible lines — presumably lists created earlier.
credit = []
debit = []
row_data = []

# NOTE(review): `i` is never changed in the visible lines, so
# statement.iloc[i:] always covers the whole frame — confirm whether it was
# meant to advance with the outer loop.
i = 0
for ind, row in statement.iloc[:].iterrows():
    string1 = row['Clean Txn']
    for ind_copy, row_copy in statement.iloc[i:].iterrows():
        string2 = row_copy['Clean Txn']
        # Skip pairs already seen in either order; the concatenated texts
        # serve as the pair key.
        if (string1 + string2) not in table and (string2 + string1) not in table:
            table.append(string1 + string2)
            trans_date = row_copy['Trans Date']
            cred = row_copy['Deposit(CR)']
            deb = row_copy['Withdrawal(DR)']
            # Only withdrawals above 5000, or rows with a zero withdrawal,
            # are scored.
            if deb > 5000 or deb == 0:
                score = td.ratcliff_obershelp(string1, string2)
                # Each right-hand text is recorded at most once.
                if score >= 0.7 and string2 not in right_side:
                    date.append(trans_date)
                    row_data.append(string1)
                    similarity.append(score)
                    right_side.append(string2)
                    credit.append(cred)
                    debit.append(deb)
                    # The left-hand text is listed on its first match only;
                    # later matches get a '-' placeholder.
                    if string1 not in left_side: left_side.append(string1)
                    else: left_side.append('-')

#     similairity.append(process.extract(string1, string2.split(), scorer=fuzz.ratio)[0][1])          
#     print (table[i], ':', score[i])
#     for index in range(0, nr_matches):
        
    
Пример #28
0
def simple_example():
    str1, str2 = 'test', 'text'
    qval = 2

    #--------------------
    # Edit-based.
    if True:
        print("textdistance.hamming({}, {}) = {}.".format(
            str1, str2, textdistance.hamming(str1, str2)))
        print("textdistance.hamming.distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.distance(str1, str2)))
        print("textdistance.hamming.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.similarity(str1, str2)))
        print("textdistance.hamming.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.hamming.normalized_distance(str1, str2)))
        print(
            "textdistance.hamming.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.hamming.normalized_similarity(str1, str2)))
        print(
            "textdistance.Hamming(qval={}, test_func=None, truncate=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Hamming(qval=qval,
                                     test_func=None,
                                     truncate=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.mlipns({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns(str1, str2)))
        print("textdistance.mlipns.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.distance(str1, str2)))
        print("textdistance.mlipns.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.similarity(str1, str2)))
        print("textdistance.mlipns.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_distance(str1, str2)))
        print("textdistance.mlipns.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mlipns.normalized_similarity(str1, str2)))
        print(
            "textdistance.MLIPNS(threshold=0.25, maxmismatches=2, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MLIPNS(threshold=0.25,
                                    maxmismatches=2,
                                    qval=qval,
                                    external=True).distance(str1, str2)))

        print("textdistance.levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein(str1, str2)))
        print("textdistance.levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.distance(str1, str2)))
        print("textdistance.levenshtein.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.levenshtein.similarity(str1, str2)))
        print("textdistance.levenshtein.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.levenshtein.normalized_distance(str1, str2)))
        print("textdistance.levenshtein.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.levenshtein.normalized_similarity(str1, str2)))
        print(
            "textdistance.Levenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Levenshtein(qval=qval,
                                         test_func=None,
                                         external=True).distance(str1, str2)))

        print("textdistance.damerau_levenshtein({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein(str1, str2)))
        print("textdistance.damerau_levenshtein.distance({}, {}) = {}.".format(
            str1, str2, textdistance.damerau_levenshtein.distance(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.damerau_levenshtein.similarity(str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.damerau_levenshtein.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.damerau_levenshtein.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.DamerauLevenshtein(qval={}, test_func=None, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.DamerauLevenshtein(qval=qval,
                                                test_func=None,
                                                external=True).distance(
                                                    str1, str2)))

        print("textdistance.jaro({}, {}) = {}.".format(
            str1, str2, textdistance.jaro(str1, str2)))
        print("textdistance.jaro.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.distance(str1, str2)))
        print("textdistance.jaro.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.similarity(str1, str2)))
        print("textdistance.jaro.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_distance(str1, str2)))
        print("textdistance.jaro.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaro(long_tolerance=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaro(long_tolerance=False,
                                  qval=qval,
                                  external=True).distance(str1, str2)))

        print("textdistance.jaro_winkler({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler(str1, str2)))
        print("textdistance.jaro_winkler.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.distance(str1, str2)))
        print("textdistance.jaro_winkler.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaro_winkler.similarity(str1, str2)))
        print("textdistance.jaro_winkler.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.jaro_winkler.normalized_distance(str1,
                                                                   str2)))
        print("textdistance.jaro_winkler.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.jaro_winkler.normalized_similarity(str1, str2)))
        print(
            "textdistance.JaroWinkler(long_tolerance=False, winklerize=True, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.JaroWinkler(long_tolerance=False,
                                         winklerize=True,
                                         qval=qval,
                                         external=True).distance(str1, str2)))

        print("textdistance.strcmp95({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95(str1, str2)))
        print("textdistance.strcmp95.distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.distance(str1, str2)))
        print("textdistance.strcmp95.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.similarity(str1, str2)))
        print("textdistance.strcmp95.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.strcmp95.normalized_distance(str1, str2)))
        print(
            "textdistance.strcmp95.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.strcmp95.normalized_similarity(str1, str2)))
        print(
            "textdistance.StrCmp95(long_strings=False, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.StrCmp95(long_strings=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.needleman_wunsch({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch(str1, str2)))
        print("textdistance.needleman_wunsch.distance({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.distance(str1, str2)))
        print("textdistance.needleman_wunsch.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.needleman_wunsch.similarity(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_distance({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_distance(str1, str2)))
        print(
            "textdistance.needleman_wunsch.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.needleman_wunsch.normalized_similarity(
                    str1, str2)))
        print(
            "textdistance.NeedlemanWunsch(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.NeedlemanWunsch(gap_cost=1.0,
                                             sim_func=None,
                                             qval=qval,
                                             external=True).distance(
                                                 str1, str2)))

        print("textdistance.gotoh({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh(str1, str2)))
        print("textdistance.gotoh.distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.distance(str1, str2)))
        print("textdistance.gotoh.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.similarity(str1, str2)))
        print("textdistance.gotoh.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_distance(str1, str2)))
        print("textdistance.gotoh.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.gotoh.normalized_similarity(str1, str2)))
        print(
            "textdistance.Gotoh(gap_open=1, gap_ext=0.4, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Gotoh(gap_open=1,
                                   gap_ext=0.4,
                                   sim_func=None,
                                   qval=qval,
                                   external=True).distance(str1, str2)))

        print("textdistance.smith_waterman({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman(str1, str2)))
        print("textdistance.smith_waterman.distance({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.distance(str1, str2)))
        print("textdistance.smith_waterman.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.smith_waterman.similarity(str1, str2)))
        print("textdistance.smith_waterman.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.smith_waterman.normalized_distance(str1, str2)))
        print(
            "textdistance.smith_waterman.normalized_similarity({}, {}) = {}.".
            format(
                str1, str2,
                textdistance.smith_waterman.normalized_similarity(str1, str2)))
        print(
            "textdistance.SmithWaterman(gap_cost=1.0, sim_func=None, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.SmithWaterman(gap_cost=1.0,
                                           sim_func=None,
                                           qval=qval,
                                           external=True).distance(str1,
                                                                   str2)))

    #--------------------
    # Token-based.
    if False:
        print("textdistance.jaccard({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard(str1, str2)))
        print("textdistance.jaccard.distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.distance(str1, str2)))
        print("textdistance.jaccard.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.similarity(str1, str2)))
        print("textdistance.jaccard.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.jaccard.normalized_distance(str1, str2)))
        print(
            "textdistance.jaccard.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.jaccard.normalized_similarity(str1, str2)))
        print(
            "textdistance.Jaccard(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Jaccard(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.sorensen({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen(str1, str2)))
        print("textdistance.sorensen.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.distance(str1, str2)))
        print("textdistance.sorensen.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.similarity(str1, str2)))
        print("textdistance.sorensen.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen.normalized_distance(str1, str2)))
        print(
            "textdistance.sorensen.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sorensen.normalized_similarity(str1, str2)))
        print(
            "textdistance.Sorensen(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Sorensen(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.sorensen_dice({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice(str1, str2)))
        print("textdistance.sorensen_dice.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.distance(str1, str2)))
        print("textdistance.sorensen_dice.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sorensen_dice.similarity(str1, str2)))
        print("textdistance.sorensen_dice.normalized_distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_distance(str1, str2)))
        print("textdistance.sorensen_dice.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.sorensen_dice.normalized_similarity(str1,
                                                                   str2)))
        #print("textdistance.SorensenDice().distance({}, {}) = {}.".format(str1, str2, textdistance.SorensenDice().distance(str1, str2)))

        print("textdistance.tversky({}, {}) = {}.".format(
            str1, str2, textdistance.tversky(str1, str2)))
        print("textdistance.tversky.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.distance(str1, str2)))
        print("textdistance.tversky.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.similarity(str1, str2)))
        print("textdistance.tversky.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tversky.normalized_distance(str1, str2)))
        print(
            "textdistance.tversky.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tversky.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tversky(qval={}, ks=None, bias=None, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tversky(qval=qval,
                                     ks=None,
                                     bias=None,
                                     as_set=False,
                                     external=True).distance(str1, str2)))

        print("textdistance.overlap({}, {}) = {}.".format(
            str1, str2, textdistance.overlap(str1, str2)))
        print("textdistance.overlap.distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.distance(str1, str2)))
        print("textdistance.overlap.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.similarity(str1, str2)))
        print("textdistance.overlap.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.overlap.normalized_distance(str1, str2)))
        print(
            "textdistance.overlap.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.overlap.normalized_similarity(str1, str2)))
        print(
            "textdistance.Overlap(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Overlap(qval=qval, as_set=False,
                                     external=True).distance(str1, str2)))

        # This is identical to the Jaccard similarity coefficient and the Tversky index for alpha=1 and beta=1.
        print("textdistance.tanimoto({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto(str1, str2)))
        print("textdistance.tanimoto.distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.distance(str1, str2)))
        print("textdistance.tanimoto.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.similarity(str1, str2)))
        print("textdistance.tanimoto.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.tanimoto.normalized_distance(str1, str2)))
        print(
            "textdistance.tanimoto.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.tanimoto.normalized_similarity(str1, str2)))
        print(
            "textdistance.Tanimoto(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Tanimoto(qval=qval, as_set=False,
                                      external=True).distance(str1, str2)))

        print("textdistance.cosine({}, {}) = {}.".format(
            str1, str2, textdistance.cosine(str1, str2)))
        print("textdistance.cosine.distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.distance(str1, str2)))
        print("textdistance.cosine.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.similarity(str1, str2)))
        print("textdistance.cosine.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_distance(str1, str2)))
        print("textdistance.cosine.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.cosine.normalized_similarity(str1, str2)))
        print(
            "textdistance.Cosine(qval={}, as_set=False, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Cosine(qval=qval, as_set=False,
                                    external=True).distance(str1, str2)))

        print("textdistance.monge_elkan({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan(str1, str2)))
        print("textdistance.monge_elkan.distance({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.distance(str1, str2)))
        print("textdistance.monge_elkan.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.monge_elkan.similarity(str1, str2)))
        print("textdistance.monge_elkan.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.monge_elkan.normalized_distance(str1, str2)))
        print("textdistance.monge_elkan.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.monge_elkan.normalized_similarity(str1, str2)))
        print(
            "textdistance.MongeElkan(algorithm=textdistance.DamerauLevenshtein(), symmetric=False, qval={}, external=True).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.MongeElkan(
                    algorithm=textdistance.DamerauLevenshtein(),
                    symmetric=False,
                    qval=qval,
                    external=True).distance(str1, str2)))

        print("textdistance.bag({}, {}) = {}.".format(
            str1, str2, textdistance.bag(str1, str2)))
        print("textdistance.bag.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.distance(str1, str2)))
        print("textdistance.bag.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.similarity(str1, str2)))
        print("textdistance.bag.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_distance(str1, str2)))
        print("textdistance.bag.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bag.normalized_similarity(str1, str2)))
        print("textdistance.Bag(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.Bag(qval=qval).distance(str1, str2)))

    #--------------------
    # Sequence-based.
    if False:
        print("textdistance.lcsseq({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq(str1, str2)))
        print("textdistance.lcsseq.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.distance(str1, str2)))
        print("textdistance.lcsseq.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.similarity(str1, str2)))
        print("textdistance.lcsseq.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_distance(str1, str2)))
        print("textdistance.lcsseq.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsseq.normalized_similarity(str1, str2)))
        #print("textdistance.LCSSeq(qval={}, test_func=None, external=True).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.LCSSeq(qval=qval, test_func=None, external=True).distance(str1, str2)))
        print("textdistance.LCSSeq().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LCSSeq().distance(str1, str2)))

        print("textdistance.lcsstr({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr(str1, str2)))
        print("textdistance.lcsstr.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.distance(str1, str2)))
        print("textdistance.lcsstr.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.similarity(str1, str2)))
        print("textdistance.lcsstr.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_distance(str1, str2)))
        print("textdistance.lcsstr.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lcsstr.normalized_similarity(str1, str2)))
        print("textdistance.LCSStr(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.LCSStr(qval=qval).distance(str1, str2)))

        print("textdistance.ratcliff_obershelp({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp(str1, str2)))
        print("textdistance.ratcliff_obershelp.distance({}, {}) = {}.".format(
            str1, str2, textdistance.ratcliff_obershelp.distance(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.ratcliff_obershelp.similarity(str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_distance(
                    str1, str2)))
        print(
            "textdistance.ratcliff_obershelp.normalized_similarity({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.ratcliff_obershelp.normalized_similarity(
                    str1, str2)))
        print("textdistance.RatcliffObershelp().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RatcliffObershelp().distance(str1, str2)))

    #--------------------
    # Compression-based.
    if False:
        print("textdistance.arith_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd(str1, str2)))
        print("textdistance.arith_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.distance(str1, str2)))
        print("textdistance.arith_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.arith_ncd.similarity(str1, str2)))
        print(
            "textdistance.arith_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.arith_ncd.normalized_distance(str1, str2)))
        print("textdistance.arith_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.arith_ncd.normalized_similarity(str1, str2)))
        #print("textdistance.ArithNCD(base=2, terminator=None, qval={}).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.ArithNCD(base=2, terminator=None, qval=qval).distance(str1, str2)))
        print("textdistance.ArithNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ArithNCD().distance(str1, str2)))

        print("textdistance.rle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd(str1, str2)))
        print("textdistance.rle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.distance(str1, str2)))
        print("textdistance.rle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.similarity(str1, str2)))
        print("textdistance.rle_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.rle_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.rle_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.rle_ncd.normalized_similarity(str1, str2)))
        print("textdistance.RLENCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.RLENCD().distance(str1, str2)))

        print("textdistance.bwtrle_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd(str1, str2)))
        print("textdistance.bwtrle_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.distance(str1, str2)))
        print("textdistance.bwtrle_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bwtrle_ncd.similarity(str1, str2)))
        print(
            "textdistance.bwtrle_ncd.normalized_distance({}, {}) = {}.".format(
                str1, str2,
                textdistance.bwtrle_ncd.normalized_distance(str1, str2)))
        print("textdistance.bwtrle_ncd.normalized_similarity({}, {}) = {}.".
              format(str1, str2,
                     textdistance.bwtrle_ncd.normalized_similarity(str1,
                                                                   str2)))
        print("textdistance.BWTRLENCD(terminator='\0').distance({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.BWTRLENCD(terminator='\0').distance(str1,
                                                                   str2)))

        print("textdistance.sqrt_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd(str1, str2)))
        print("textdistance.sqrt_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.distance(str1, str2)))
        print("textdistance.sqrt_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.similarity(str1, str2)))
        print("textdistance.sqrt_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.sqrt_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.sqrt_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.sqrt_ncd.normalized_similarity(str1, str2)))
        print("textdistance.SqrtNCD(qval={}).distance({}, {}) = {}.".format(
            qval, str1, str2,
            textdistance.SqrtNCD(qval=qval).distance(str1, str2)))

        print("textdistance.entropy_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd(str1, str2)))
        print("textdistance.entropy_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.distance(str1, str2)))
        print("textdistance.entropy_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.entropy_ncd.similarity(str1, str2)))
        print("textdistance.entropy_ncd.normalized_distance({}, {}) = {}.".
              format(str1, str2,
                     textdistance.entropy_ncd.normalized_distance(str1, str2)))
        print("textdistance.entropy_ncd.normalized_similarity({}, {}) = {}.".
              format(
                  str1, str2,
                  textdistance.entropy_ncd.normalized_similarity(str1, str2)))
        print(
            "textdistance.EntropyNCD(qval={}, coef=1, base=2).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.EntropyNCD(qval=qval, coef=1,
                                        base=2).distance(str1, str2)))

        print("textdistance.bz2_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd(str1, str2)))
        print("textdistance.bz2_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.distance(str1, str2)))
        print("textdistance.bz2_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.similarity(str1, str2)))
        print("textdistance.bz2_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.bz2_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.bz2_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.bz2_ncd.normalized_similarity(str1, str2)))
        print("textdistance.BZ2NCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.BZ2NCD().distance(str1, str2)))

        print("textdistance.lzma_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd(str1, str2)))
        print("textdistance.lzma_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.distance(str1, str2)))
        print("textdistance.lzma_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.similarity(str1, str2)))
        print("textdistance.lzma_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.lzma_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.lzma_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.lzma_ncd.normalized_similarity(str1, str2)))
        print("textdistance.LZMANCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.LZMANCD().distance(str1, str2)))

        print("textdistance.zlib_ncd({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd(str1, str2)))
        print("textdistance.zlib_ncd.distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.distance(str1, str2)))
        print("textdistance.zlib_ncd.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.similarity(str1, str2)))
        print("textdistance.zlib_ncd.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.zlib_ncd.normalized_distance(str1, str2)))
        print(
            "textdistance.zlib_ncd.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.zlib_ncd.normalized_similarity(str1, str2)))
        print("textdistance.ZLIBNCD().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.ZLIBNCD().distance(str1, str2)))

    #--------------------
    # Phonetic.
    if False:
        print("textdistance.mra({}, {}) = {}.".format(
            str1, str2, textdistance.mra(str1, str2)))
        print("textdistance.mra.distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.distance(str1, str2)))
        print("textdistance.mra.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.similarity(str1, str2)))
        print("textdistance.mra.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_distance(str1, str2)))
        print("textdistance.mra.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.mra.normalized_similarity(str1, str2)))
        print("textdistance.MRA().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.MRA().distance(str1, str2)))

        print("textdistance.editex({}, {}) = {}.".format(
            str1, str2, textdistance.editex(str1, str2)))
        print("textdistance.editex.distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.distance(str1, str2)))
        print("textdistance.editex.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.similarity(str1, str2)))
        print("textdistance.editex.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_distance(str1, str2)))
        print("textdistance.editex.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.editex.normalized_similarity(str1, str2)))
        print(
            "textdistance.Editex(local=False, match_cost=0, group_cost=1, mismatch_cost=2, groups=None, ungrouped=None, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Editex(local=False,
                                    match_cost=0,
                                    group_cost=1,
                                    mismatch_cost=2,
                                    groups=None,
                                    ungrouped=None,
                                    external=True).distance(str1, str2)))

    #--------------------
    # Simple.
    if False:
        print("textdistance.prefix({}, {}) = {}.".format(
            str1, str2, textdistance.prefix(str1, str2)))
        print("textdistance.prefix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.distance(str1, str2)))
        print("textdistance.prefix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.similarity(str1, str2)))
        print("textdistance.prefix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_distance(str1, str2)))
        print("textdistance.prefix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.prefix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Prefix(qval={}, sim_test=None).distance({}, {}) = {}."
            .format(
                qval, str1, str2,
                textdistance.Prefix(qval=qval,
                                    sim_test=None).distance(str1, str2)))

        print("textdistance.postfix({}, {}) = {}.".format(
            str1, str2, textdistance.postfix(str1, str2)))
        print("textdistance.postfix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.distance(str1, str2)))
        print("textdistance.postfix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.similarity(str1, str2)))
        print("textdistance.postfix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.postfix.normalized_distance(str1, str2)))
        print(
            "textdistance.postfix.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.postfix.normalized_similarity(str1, str2)))
        #print("textdistance.Postfix(qval={}, sim_test=None).distance({}, {}) = {}.".format(qval, str1, str2, textdistance.Postfix(qval=qval, sim_test=None).distance(str1, str2)))
        print("textdistance.Postfix().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Postfix().distance(str1, str2)))

        print("textdistance.length({}, {}) = {}.".format(
            str1, str2, textdistance.length(str1, str2)))
        print("textdistance.length.distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.distance(str1, str2)))
        print("textdistance.length.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.similarity(str1, str2)))
        print("textdistance.length.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_distance(str1, str2)))
        print("textdistance.length.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.length.normalized_similarity(str1, str2)))
        print("textdistance.Length().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Length().distance(str1, str2)))

        print("textdistance.identity({}, {}) = {}.".format(
            str1, str2, textdistance.identity(str1, str2)))
        print("textdistance.identity.distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.distance(str1, str2)))
        print("textdistance.identity.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.identity.similarity(str1, str2)))
        print("textdistance.identity.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.identity.normalized_distance(str1, str2)))
        print(
            "textdistance.identity.normalized_similarity({}, {}) = {}.".format(
                str1, str2,
                textdistance.identity.normalized_similarity(str1, str2)))
        print("textdistance.Identity().distance({}, {}) = {}.".format(
            str1, str2,
            textdistance.Identity().distance(str1, str2)))

        print("textdistance.matrix({}, {}) = {}.".format(
            str1, str2, textdistance.matrix(str1, str2)))
        print("textdistance.matrix.distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.distance(str1, str2)))
        print("textdistance.matrix.similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.similarity(str1, str2)))
        print("textdistance.matrix.normalized_distance({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_distance(str1, str2)))
        print("textdistance.matrix.normalized_similarity({}, {}) = {}.".format(
            str1, str2, textdistance.matrix.normalized_similarity(str1, str2)))
        print(
            "textdistance.Matrix(mat=None, mismatch_cost=0, match_cost=1, symmetric=True, external=True).distance({}, {}) = {}."
            .format(
                str1, str2,
                textdistance.Matrix(mat=None,
                                    mismatch_cost=0,
                                    match_cost=1,
                                    symmetric=True,
                                    external=True).distance(str1, str2)))
Пример #29
0
def ratcliff_obershelp_similarity(row, attr):
    """Score the left/right values of ``attr`` in ``row`` with Ratcliff-Obershelp.

    The values are read from ``row[attr + "_l"]`` and ``row[attr + "_r"]``,
    lower-cased, and handed to ``td.ratcliff_obershelp``; its result is
    returned unchanged.
    """
    left, right = (row[attr + suffix].lower() for suffix in ("_l", "_r"))
    return td.ratcliff_obershelp(left, right)
Пример #30
0
def _strip_director_titles(raw_name):
    """Return ``raw_name`` without the honorifics "Mr.", "Dr." and "Ms."."""
    cleaned = raw_name
    # Each replace must build on the previous result; restarting from the raw
    # name every time would keep only the last substitution.
    for title in ("Mr.", "Dr.", "Ms."):
        cleaned = cleaned.replace(title, '')
    # Removing a leading title leaves a stray space behind.
    return cleaned.strip()


def _director_last_name(name):
    """Return the last space-separated token of ``name``.

    A name without spaces (e.g. ``"J.Smith"``) falls back to the text after
    its last period.
    """
    parts = name.split(' ')
    if len(parts) == 1:
        return name.split('.')[-1]
    return parts[-1]


def _name_letters(name):
    """Return the set of characters in ``name``, ignoring ',', '.' and ' '."""
    return {ch for ch in name if ch not in ",. "}


def SearchFilingView(request):
    """Render the filing page for the company/filing named in the query string.

    Reads ``q`` (company ticker) and ``fid`` (filing id) from ``request.GET``,
    loads the company, its filings, funds, directors and executives, and for
    every director of the company collects the other companies whose director
    list contains a sufficiently similar name. The filing's HTML is read from
    disk and passed to ``TOCExtractor`` to obtain a table of contents.

    Args:
        request: The incoming HTTP request.

    Returns:
        The response produced by ``render`` for ``companyFiling.html`` with
        the context keys ``object_list``, ``extended_template``,
        ``table_of_contents`` and ``filing_html``.

    Raises:
        Whatever ``Company.objects.get`` / ``Filing.objects.get`` raise when
        the ticker or filing id does not resolve, and ``OSError`` when the
        filing file cannot be opened.
    """
    template_name = 'companyFiling.html'

    extended_template = 'base_company.html'
    if request.user.is_authenticated:
        extended_template = 'base_company_member.html'

    query = request.GET.get('q')
    fid = request.GET.get('fid')
    mycompany = Company.objects.get(ticker=query)
    filings = Filing.objects.filter(cik=mycompany.cik).order_by('-filingdate')
    filing = Filing.objects.get(id=fid)  # the filing requested by fid

    # Normalise the company name to the spelling used for the funds lookup.
    name = mycompany.name.upper()
    for old, new in (
        ('INTERNATIONAL', 'INTL'),
        (' /DE', ''),
        ('/DE', ''),
        ('INC.', 'INC'),
        (',', ''),
    ):
        name = name.replace(old, new)

    funds = Funds.objects.raw(
        'SELECT * FROM edgarapp_funds WHERE company = %s ORDER BY share_prn_amount+0 DESC LIMIT 100',
        [name])

    directors = Directors.objects.filter(
        company=mycompany.name).order_by('-director')

    allDirectors = Directors.objects.all()

    executives = Executives.objects.filter(company=mycompany.name)

    # Keep only executive rows whose filing date falls in the current year.
    current_year = str(datetime.today().year)
    exectable = [
        executive for executive in executives
        if executive.filingdate.split('-')[0] == current_year
    ]

    # Index every known director by last name once, so each director of this
    # company is only compared against candidates sharing that last name.
    # Insertion order is kept, so matches come out in allDirectors order.
    candidates_by_last_name = {}
    if directors:  # skip the full-table scan when there is nothing to match
        for check in allDirectors:
            personB = _strip_director_titles(check.director)
            candidates_by_last_name.setdefault(
                _director_last_name(personB), []).append(
                    (personB, check.company))

    matches = []
    for person in directors:
        comps = []
        if person:
            personA = _strip_director_titles(person.director)
            a = _name_letters(personA)
            candidates = candidates_by_last_name.get(
                _director_last_name(personA), ())
            for personB, company in candidates:
                # first check jaccard index to speed up algo, threshold of .65
                b = _name_letters(personB)
                union = a | b
                jaccard = len(a & b) / len(union) if union else 1
                if jaccard <= 0.65:
                    continue
                # run Ratcliff-Obershelp for further matching, threshold of
                # .75 and prevent self-match
                sequence = textdistance.ratcliff_obershelp(personA, personB)
                if sequence > 0.75 and mycompany.name != company:
                    comps.append(company)
        if not comps:
            comps.append('Director is not on the board of any other companies')
        # One entry per director, so matches stays aligned with directors.
        matches.append(comps)

    url = '/mnt/filings-static/capitalrap/edgarapp/static/filings/' + filing.filingpath

    # NOTE: extraction of in-document anchor links is currently disabled, so
    # the template always receives an empty list here.
    links = []

    # object_list is ((q, fid), (companyname, ticker), filings, filing, funds,
    # zip(directors, matches), zip(exectable, matches), links)
    object_list = []
    object_list.append((query, fid))
    object_list.append((mycompany.name, mycompany.ticker))
    object_list.append(filings)
    object_list.append(filing)
    object_list.append(funds)
    object_list.append(zip(directors, matches))
    # NOTE(review): matches is built per director, not per executive, so this
    # pairing looks accidental -- confirm against the template before changing.
    object_list.append(zip(exectable, matches))
    object_list.append(links)

    toc_extractor = TOCExtractor()

    with open(url) as file:
        filing_html = file.read()

    # Best effort: a filing without an extractable table of contents still
    # renders, but interpreter-level exits are no longer swallowed.
    try:
        table_of_contents = toc_extractor.extract(filing_html).table
    except Exception:
        table_of_contents = ""

    return render(
        request, template_name, {
            'object_list': object_list,
            'extended_template': extended_template,
            'table_of_contents': table_of_contents,
            'filing_html': filing_html
        })