Example #1
def test_distance():
    assert_equal(levenshtein(S1, S2), 3)
    assert_equal(levenshtein(U1, U2), 3)
    assert_equal(levenshtein(S2, S1), 3)
    assert_equal(levenshtein(U2, U1), 3)

    for x in (S1, S2, U1, U2):
        assert_equal(levenshtein(x, x), 0)
Example #2
def file_editdistance_eval(path1, path2):
    with open(path1, 'r') as f:
        str1 = strip_whitespace(f.read())
    with open(path2, 'r') as f:
        str2 = strip_whitespace(f.read())

    return levenshtein(str1, str2)
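strip_whitespace is not defined in this snippet; a minimal, hypothetical stand-in that removes all whitespace before the comparison could look like this:

import re

def strip_whitespace(text):
    # Hypothetical helper (not part of the original snippet): drop every
    # whitespace character so only the file content is compared.
    return re.sub(r"\s+", "", text)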
Example #3
def distanceMetric(start, end):
    # Load the pickled action sequences and turn each one into a string so the
    # Levenshtein distance can be computed character by character.
    with open("actionList.pkl", "rb") as infile:
        actions = pickle.load(infile)
    actionStrings = []
    for item in actions:
        actionstring = ""
        for i in item:
            if i == -1:
                i = 255
            actionstring += chr(i)
        actionStrings.append(actionstring)

    if end > len(actions):
        end = len(actions)
    distanceMatrix = []
    for i in range(start, end):
        print("process:", i)
        dist = []
        for j in range(0, len(actionStrings)):
            distance = levenshtein(actionStrings[i], actionStrings[j])
            # print(distance)
            dist.append(distance)
        distanceMatrix.append(dist)
    with open("distMetric" + str(start) + ".pkl", "wb") as output:
        pickle.dump(distanceMatrix, output)
Example #4
    def feature_TITLE(self, lnk, re_label_text, features):
        label_text = unicode(lnk["label"])

        re_title = stringUtils.ngramToPattern(lnk['title'])
        article_title = unicode(lnk['title'])

        features["NCT"] = 0 if re.search(re_title, label_text) is None \
            else 1

        features["TCN"] = 0 \
            if re.search(re_label_text, article_title) is None else 1

        features["TEN"] = 1 if article_title == label_text else 0

        # Irritatingly enough, split() can give you empty values as last
        # element
        split_label = self.re_non_word_chars.split(label_text)
        if split_label[-1] == '':
            split_label.pop()
        split_title = self.re_non_word_chars.split(article_title)
        if split_title[-1] == '':
            split_title.pop()

        # I: True if the title of the candidate begins with the query
        # (e.g. "Cambridge, Massachusetts" and "Cambridge" )
        features["SUBSTRING_MATCH_1"] = 1 \
            if split_title[0] == split_label[0] else 0

        # II: True if the title of the candidate ends with the query
        # (e.g. "Venice-Simplon Orient Express" and "Orient Express")
        features["SUBSTRING_MATCH_2"] = 1 \
            if split_title[-1] == split_label[-1] else 0

        # collections.Counter() converts an array to a dict of words
        # and their frequencies
        cSplitLabel = collections.Counter(split_label)
        cSplitTitle = collections.Counter(split_title)

        # Number of shared words between the title of the candidate and
        # the query
        features['WORD_MATCH'] = len(list(cSplitLabel & cSplitTitle))

        # Number of different words between the title of the candidate
        # and the query
        features['WORD_MISS'] = len(split_label) + len(split_title) \
            - (2 * features['WORD_MATCH'])

        # Levenshtein distance between query and title of the candidate
        features["EDIT_DISTANCE"] = levenshtein(label_text, article_title)
Example #5
def test_normalize():
    assert_true(isinstance(levenshtein(S2, S1), numbers.Integral))
    assert_equal(levenshtein("", "", normalize=True), 0)
    assert_equal(levenshtein(S1, S2, normalize=True), 3 / 7)
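The S1/S2/U1/U2 fixtures are not shown in these snippets; hypothetical values that would be consistent with both the distance of 3 asserted in Example #1 and the 3 / 7 normalized value asserted here are:

# Hypothetical fixtures: "kitten" -> "sitting" needs 3 edits, and the longer
# string has length 7, matching the normalize=True assertion above.
S1, S2 = "kitten", "sitting"
U1, U2 = u"kitten", u"sitting"   # the same pair as explicit unicode literals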
Example #6
def metric_levenshtein(predictions, labels):
    predictions = predictions.softmax().topk(axis=2).asnumpy()
    zipped = zip(decode(labels.asnumpy()), decode(predictions))
    metric = sum([(len(label) - levenshtein(label, pred)) / len(label)
                  for label, pred in zipped])
    return metric / len(labels)
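Each term of the sum above is one minus the character error rate for a sample; a tiny hypothetical check of the arithmetic, independent of the MXNet decoding:

from leven import levenshtein   # assumed source of levenshtein

label, pred = "kitten", "sitten"   # hypothetical decoded label/prediction pair
print((len(label) - levenshtein(label, pred)) / len(label))   # (6 - 1) / 6 = 0.8333...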
Example #7
def lev_metric(x, y):
    i, j = int(x[0]), int(y[0])     # extract indices
    return levenshtein(data[i], data[j])
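This index-based wrapper is a common way to plug a string metric into scikit-learn estimators that only accept numeric arrays; a hypothetical usage sketch (the data list, eps, and the leven import are assumptions, not part of the example):

import numpy as np
from leven import levenshtein          # assumed source of levenshtein
from sklearn.cluster import dbscan     # the functional API accepts a callable metric

data = ["kitten", "sitting", "holiday", "holidays"]   # hypothetical corpus

def lev_metric(x, y):
    i, j = int(x[0]), int(y[0])        # x and y are 1-element index vectors
    return levenshtein(data[i], data[j])

# Cluster over a column of row indices; the metric dereferences them into strings.
X = np.arange(len(data)).reshape(-1, 1)
core_samples, labels = dbscan(X, metric=lev_metric, eps=3, min_samples=2)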
Example #8
# f and t are assumed to be defined earlier in the (truncated) script.
try:
    r = requests.get('http://0.0.0.0:8000/circle?_f=%s&_t=%s' % (f, t))
except Exception:
    print('is the smurf hug api started?')
    print('try: hug -m smurf.hug_api')
    sys.exit(-1)

rj = r.json()

rjc = cycle(rj)
l = len(rj)
p = n = None
print('circle: edit distances should be 1')
while l:
    if not p: p = next(rjc)
    n = next(rjc)
    print(p, n, leven.levenshtein(p, n))
    p = n
    l -= 1

print('random: edit distances should not always be 1')
l = len(rj)
p = n = None
random.shuffle(rj)
rjc = cycle(rj)
while l:
    if not p: p = next(rjc)
    n = next(rjc)
    print(p, n, leven.levenshtein(p, n))
    p = n
    l -= 1
Example #9
def checkAuthorMatch(article1,
                     article2,
                     authorName=None,
                     yearGap=2,
                     nCommonAuths=3,
                     matchThresh=0.70,
                     absMatchLimit=0.5,
                     titleMatchLimit=0.5,
                     keywordMatchTol=2):
    """
    Check if two articles are probably from the same author.

    I think they are if:
    1) They are published within yearGap of each other, from the same location
    or
    2) They share nCommonRefs or more
    or
    3) They share nCommonAuths or more
    or
    4) Author lists are >1 and have the same names.
    or
    5) Author of interest is spelled exactly the same, and they share 2+ keywords

    matchThresh sets the threshold for doing fuzzy string comparison with the
    affiliation names (so, 'Univeristy of Washington' will match
    'University of Washington, Astronomy Department').

    Returns
    -------
    bool

    """
    result = False
    # If any of the criteria match, return True and bail out

    if authorName is None:
        authorName = authSimple(article1.author[0])

    # If author list is >1 and identical
    if len(article1.authorset) > 1:
        if article1.authorset == article2.authorset:
            return True

    # If published within yearGap from same location
    if hasattr(article1, 'year') & hasattr(article2, 'year'):
        if np.abs(int(article1.year) - int(article2.year)) <= yearGap:
            aff1 = None
            aff2 = None
            lDists = [
                levenshtein(unicode(authorName), unicode(authSimple(name)))
                for name in article1.author
            ]
            good = np.where(lDists == np.min(lDists))[0]
            # Can't tell which one is author, can't link it up.
            if np.size(good) > 1:
                return False
            if lDists[good] < 3:
                aff1 = affClean(article1.aff[good])
            lDists = [
                levenshtein(unicode(authorName), unicode(authSimple(name)))
                for name in article2.author
            ]
            good = np.where(lDists == np.min(lDists))[0]
            if np.size(good) > 1:
                return False
            if lDists[good] < 3:
                aff2 = affClean(article2.aff[good])
            if (aff1 is not None) & (aff2 is not None):
                if checkAffMatch(aff1, aff2, matchThresh=matchThresh):
                    return True

    # Check if they have enough common authors
    commonAuthors = article1.authorset.intersection(article2.authorset)
    if len(commonAuthors) >= nCommonAuths:
        return True

    # If the keywords are in common
    if (article1.keyword is not None) & (article2.keyword is not None):
        if len(set(article1.keyword)
               & set(article2.keyword)) > keywordMatchTol:
            # If the author is spelled exactly the same, and there are matching keywords
            a1 = [
                author for author in article1.author
                if authSimple(author) == authSimple(authorName)
            ]
            a2 = [
                author for author in article2.author
                if authSimple(author) == authSimple(authorName)
            ]
            if (len(a1) == 1) & (len(a2) == 1):
                # If the names match exactly
                if a1[0] == a2[0]:
                    return True

    return result
Example #10
def test_normalize():
    assert_true(isinstance(levenshtein(S2, S1), numbers.Integral))
    assert_equal(levenshtein("", "", normalize=True), 0)
    assert_equal(levenshtein(S1, S2, normalize=True), 3 / 7)
Example #11
    def filter_by_levenshtein(self, word_spell, distance):
        words = self.words
        for word in words:
            if levenshtein(word_spell, word.entry.form) <= distance:
                yield word
Example #12
    def generate_sequences(self, inputs, begin_states, sentence):
        samples, scores, valid_lengths = self.sampler(inputs, begin_states)
        samples = samples[0].asnumpy()
        scores = scores[0].asnumpy()
        valid_lengths = valid_lengths[0].asnumpy()
        max_score = -10e20

        # Heuristic #1
        # If the sentence is correct, let's not try to change it
        sentence_tokenized = [
            i.replace("&quot;", '"').replace("&apos;",
                                             "'").replace("&amp;", "&")
            for i in self.tokenizer(sentence)
        ]
        sentence_correct = True
        for token in sentence_tokenized:
            if (token not in self.vocab
                    or self.vocab[token] > 400000) and token.lower() not in [
                        "don't", "doesn't", "can't", "won't", "ain't",
                        "couldn't", "i'd", "you'd", "he's", "she's", "it's",
                        "i've", "you've", "she'd"
                    ]:
                sentence_correct = False
                break
        if sentence_correct:
            return sentence

        # Heuristic #2
        # We want sentences that have the most in-vocabulary words,
        # and we penalize sentences that have out-of-vocabulary words
        # that do not start with a capital letter
        for i, sample in enumerate(samples):
            tokens = decode_char(sample[:valid_lengths[i]])
            tokens = [
                i.replace("&quot;", '"').replace("&apos;",
                                                 "'").replace("&amp;", "&")
                for i in self.tokenizer(tokens)
            ]
            score = 0

            for t in tokens:
                # Boosting names
                if (t in self.vocab
                        and self.vocab[t] < 450000) or (len(t) > 0
                                                        and t.istitle()):
                    score += 0
                else:
                    score -= 1
                score -= 0
            if score == max_score:
                max_score = score
                best_tokens.append(tokens)
            elif score > max_score:
                max_score = score
                best_tokens = [tokens]

        # Heuristic #3
        # Smallest edit distance
        # We then take the sentence with the lowest edit distance
        # from the predicted original sentence
        best_dist = 1000
        output_tokens = best_tokens[0]
        best_tokens_ = []
        for tokens in best_tokens:
            dist = leven.levenshtein(sentence,
                                     ' '.join(self.detokenizer(tokens)))
            if dist < best_dist:
                best_dist = dist
                best_tokens_ = [tokens]
            elif dist == best_dist:
                best_tokens_.append(tokens)

        # Heuristic #4
        # We take the sentence with the smallest number of tokens
        # to avoid splitting up composed words
        min_len = 10e20
        for tokens in best_tokens_:
            if len(tokens) < min_len:
                min_len = len(tokens)
                best_tokens__ = [tokens]
            elif len(tokens) == min_len:
                best_tokens__.append(tokens)

        # Heuristic #5
        # Lowest ppl
        # If we still have ties, we take the sentence with the lowest
        # perplexity score according to the language model
        best_ppl = 10e20
        for tokens in best_tokens__:
            if len(tokens) > 1:
                inputs = self.vocab[tokens]
                hidden = self.language_model.begin_state(batch_size=1,
                                                         func=mx.nd.zeros,
                                                         ctx=self.ctx_nlp)
                output, _ = self.language_model(
                    mx.nd.array(inputs).expand_dims(axis=1).as_in_context(
                        self.ctx_nlp), hidden)
                output = output.softmax()
                l = 0
                for i in range(1, len(inputs)):
                    l += -output[i - 1][0][inputs[i]].log()
                ppl = (l / len(inputs)).exp()
                if ppl < best_ppl:
                    output_tokens = tokens
                    best_ppl = ppl
        output = ' '.join(self.detokenizer(output_tokens))

        # Heuristic #6
        # Sometimes there are artefacts at the end of the corrected sentence,
        # so we cut the end of the sentence
        if len(output) > len(sentence) + 10:
            output = output[:len(sentence) + 2]
        return output
Example #13
    def affinity_propagation(self,
                             entity_group: list,
                             metric: str = None,
                             damping: float = None,
                             preference: int = None,
                             embeddings: list = None,
                             entity_name: str = None,
                             selected_base_models: list = None):
        """
        In contrast to other traditional clustering methods, affprop does not require you to specify the number of
        clusters. In creators' terms, in affprop, each data point sends messages to all other points informing its
        targets of each target’s relative attractiveness to the sender. Each target then responds to all senders with a
        reply informing each sender of its availability to associate with the sender, given the attractiveness of the
        messages that it has received from all other senders. Senders reply to the targets with messages informing each
        target of the target’s revised relative attractiveness to the sender, given the availability messages it has
        received from all targets. The message-passing procedure proceeds until a consensus is reached. Once the sender
        is associated with one of its targets, that target becomes the point’s exemplar. All points with the same
        exemplar are placed in the same cluster.

        :param entity_group, company_names, locations, or unknown_soup for everything else.
        :param metric, distance/similarity metric - jaro or levenshtein. Only needed when "graph_representation"
         is selected at constructor time
        :param damping, damps the responsibility and availability messages to avoid numerical oscillations when updating
         these messages.
        :param entity_name, useful for results file naming
        :param embeddings, list of embeddings in case of vector representation
        :param selected_base_models, needed for JSON file naming
        :param preference, Preferences for each point - points with larger values of preferences are more likely to be
        chosen as exemplars. The number of exemplars, ie of clusters, is influenced by the input preferences value.
        If the preferences are not passed as arguments, they will be set to the median of the input similarities.
        :return: clusters
        """
        words = np.asarray(entity_group)
        # graph representation aka string similarity - requires a precomputed matrix of distances between strings
        # and a "custom" distance metric
        if self.representation == "graph_representation":
            affinity = "precomputed"
            selected_base_models = ''
            if metric == "jaro":
                distance_matrix = np.array([[jaro(w1, w2) for w1 in words]
                                            for w2 in words])
                features = distance_matrix
            elif metric == "levenshtein":
                similarity_matrix = -1 * np.array(
                    [[levenshtein(w1, w2) for w1 in words] for w2 in words])
                features = similarity_matrix
            else:
                raise SystemExit(
                    f"[ERROR]: function affinity_propagation() -> Provide one of the available metrics: "
                    f"['jaro', 'levenshtein']")
        else:
            # in this case we are dealing with embeddings; AffinityPropagation only supports the 'euclidean' affinity for raw feature vectors
            affinity = 'euclidean'
            metric = affinity
            features = embeddings
            if selected_base_models:
                selected_base_models = selected_base_models

        affprop = AffinityPropagation(affinity=affinity,
                                      damping=damping,
                                      preference=preference,
                                      random_state=None)
        affprop.fit(features)

        clusters = {}
        for cluster_id in np.unique(affprop.labels_):
            exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
            cluster = np.unique(
                words[np.nonzero(affprop.labels_ == cluster_id)])
            cluster_str = ", ".join(cluster)
            clusters.update({exemplar: cluster_str})

            if self.print_output:
                print(f"- **{exemplar}** --> {cluster_str}")

        if self.save_output:
            # name of json is dynamic - all parameter values are integrated in the name of the file itself
            with open(
                    f"{str(Path.cwd())}/results/{self.representation[0]}_affinity_"
                    f"{metric}_{str(damping)}_{preference}_{entity_name}_{selected_base_models}.json",
                    "w+") as out:
                json.dump(clusters, out, indent=4, sort_keys=True)

        return clusters
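Stripped of the file handling, the "levenshtein" branch above amounts to handing scikit-learn a negated edit-distance matrix as a precomputed similarity; a hypothetical standalone sketch (the word list and damping value are assumptions):

import numpy as np
from leven import levenshtein                   # assumed source of levenshtein
from sklearn.cluster import AffinityPropagation

words = np.asarray(["google", "googel", "amazon", "amazn", "microsoft"])   # hypothetical

# Negated distances act as similarities, mirroring the "levenshtein" branch above.
similarity = -1 * np.array([[levenshtein(w1, w2) for w1 in words] for w2 in words])

affprop = AffinityPropagation(affinity="precomputed", damping=0.5, random_state=None)
affprop.fit(similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    members = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    print(exemplar, "->", ", ".join(members))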
Example #14
    def agglomerative(self,
                      entity_group: list = None,
                      metric: str = None,
                      linkage: str = None,
                      distance_threshold: float = None,
                      compute_full_tree: bool = True,
                      n_clusters: int = None,
                      embeddings: list = None,
                      entity_name: str = None,
                      selected_base_models: list = None):
        """
        In agglomerative algorithms, each item starts in its own cluster and the two most similar items are then
        clustered. We continue accumulating the most similar items or clusters together two at a time until
        there is one cluster.

        :param entity_group, name of the entity group (company name, location, unknown soup).
        :param metric, distance/similarity metric - jaro or levenshtein. Only needed when "graph_representation"
         is selected at constructor time
        :param linkage, {“ward”, “complete”, “average”, “single”}, default=”ward” .Which linkage criterion to use.
        The linkage criterion determines which distance to use between sets of observation. The algorithm will merge
        the pairs of cluster that minimize this criterion.
                * ward minimizes the variance of the clusters being merged.
                * average uses the average of the distances of each observation of the two sets.
                * complete or maximum linkage uses the maximum distances between all observations of the two sets.
                * single uses the minimum of the distances between all observations of the two sets.
        :param distance_threshold, float, default=None. The linkage distance threshold above which, clusters will not
        be merged. If not None, n_clusters must be None and compute_full_tree must be True.
        :param n_clusters, the number of clusters to find. It must be None if distance_threshold is not None.
        :param compute_full_tree, ‘auto’ or bool, default=’auto'. It must be True if distance_threshold is not None.
        By default compute_full_tree is “auto”, which is equivalent to True when distance_threshold is not None or that
         n_clusters is inferior to the maximum between 100 or 0.02 * n_samples. Otherwise, “auto” is equivalent to False
        :param embeddings, list of embeddings in case of vector representation
        :param entity_name, the name of the set - helps with json naming (optional)
        :param selected_base_models, needed for JSON file naming
        :return: clusters
        """
        data = list(entity_group)
        data = np.asarray(data)

        # graph representation aka string similarity - requires a precomputed matrix of distances between strings
        # and a "custom" distance metric
        if self.representation == "graph_representation":
            affinity = "precomputed"
            selected_base_models = ''
            if metric == "jaro":
                distance_matrix = np.array([[jaro(w1, w2) for w1 in data]
                                            for w2 in data])
                features = distance_matrix
            elif metric == "levenshtein":
                similarity_matrix = -1 * np.array(
                    [[levenshtein(w1, w2) for w1 in data] for w2 in data])
                features = similarity_matrix
            else:
                raise SystemExit(
                    f"[ERROR]: function affinity_propagation() -> Provide one of the available metrics: "
                    f"['jaro', 'levenshtein']")
        else:
            # in this case we are dealing with embeddings and a standard scikit-learn distance such as 'euclidean'
            affinity = metric
            metric = affinity
            features = embeddings
            if selected_base_models:
                selected_base_models = selected_base_models

        agg = AgglomerativeClustering(affinity=affinity,
                                      linkage=linkage,
                                      distance_threshold=distance_threshold,
                                      compute_full_tree=compute_full_tree,
                                      n_clusters=n_clusters)
        agg.fit(features)

        clusters = {}
        for idx, label in enumerate(agg.labels_):
            if label not in clusters.keys():
                clusters.update({int(label): [data[idx]]})
            else:
                clusters[int(label)].append(data[idx])

        if self.print_output:
            for key, item in clusters.items():
                print(key, item)
        if self.save_output:
            # name of json is dynamic - all parameter values are integrated in the name of the file itself
            with open(
                    f"{str(Path.cwd())}/results/{self.representation[0]}_agglomarative_"
                    f"{str(metric)}_{linkage}_{distance_threshold}_{compute_full_tree}_"
                    f"{entity_name}_{selected_base_models}.json", "w+") as out:
                json.dump(clusters, out, indent=4, sort_keys=True)

        return clusters
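The same precomputed-matrix idea carries over to the agglomerative variant; a hypothetical sketch (the word list and threshold are assumptions; note that a precomputed matrix rules out the default "ward" linkage, and that recent scikit-learn releases renamed affinity= to metric=):

import numpy as np
from leven import levenshtein                       # assumed source of levenshtein
from sklearn.cluster import AgglomerativeClustering

words = np.asarray(["google", "googel", "amazon", "amazn"])   # hypothetical
dist = np.array([[levenshtein(a, b) for a in words] for b in words])

# "ward" requires euclidean features, so "average" linkage is used with the
# precomputed distance matrix (pass metric= instead of affinity= on scikit-learn >= 1.4).
agg = AgglomerativeClustering(affinity="precomputed", linkage="average",
                              distance_threshold=3.0, n_clusters=None,
                              compute_full_tree=True)
agg.fit(dist)
print(dict(zip(words, agg.labels_)))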
Example #15
    def dbscan(self,
               entity_group: list = None,
               metric: str = None,
               epsilon: float = None,
               min_samples: int = None,
               embeddings: list = None,
               entity_name: str = None,
               selected_base_models: list = None):
        """
        Density-based clustering works by identifying “dense” clusters of points, allowing it to learn clusters of
        arbitrary shape and identify outliers in the data. The general idea behind ɛ-neighborhoods is given a data
        point, we want to be able to reason about the data points in the space around it. Formally, for some
        real-valued ɛ > 0 and some point p, the ɛ-neighborhood of p is defined as the set of points that are at most
        distance ɛ away from p. In 2D space, the ɛ-neighborhood of a point p is the set of points contained in a circle
         of radius ɛ, centered at p.

        :param entity_group, name of the entity group (company name, location, unknown soup).
        :param metric, distance/similarity metric - jaro or levenshtein. Only needed when "graph_representation"
         is selected at constructor time
        :param epsilon, ɛ, the radius (size) of the neighborhood around a data point p.
        :param min_samples, the minimum number of data points that have to be within that neighborhood for a point
        to be considered a core point (of that given cluster) - cluster density level threshold.
        :param embeddings, list of embeddings in case of vector representation
        :param entity_name, the name of the set - helps with json naming (optional)
        :param selected_base_models, needed for JSON file naming
        :return: clusters
        """
        words = list(entity_group)

        # graph representation aka string similarity - requires a precomputed matrix of distances between strings
        # and a "custom" distance metric
        if self.representation == "graph_representation":
            affinity = "precomputed"
            selected_base_models = ''
            if metric == "jaro":
                distance_matrix = np.array([[jaro(w1, w2) for w1 in words]
                                            for w2 in words])
                features = distance_matrix
            elif metric == "levenshtein":
                similarity_matrix = np.array(
                    [[levenshtein(w1, w2) for w1 in words] for w2 in words])
                features = similarity_matrix
            else:
                raise SystemExit(
                    f"[ERROR]: function affinity_propagation() -> Provide one of the available metrics: "
                    f"['jaro', 'levenshtein']")
        else:
            # in this case we are dealing with embeddings and a standard scikit-learn distance such as 'euclidean'
            affinity = metric
            metric = affinity
            features = embeddings
            if selected_base_models:
                selected_base_models = selected_base_models

        model = DBSCAN(eps=epsilon, min_samples=min_samples, metric=affinity)

        features = np.array(features)
        model.fit(features)

        clusters = {}
        for idx, label in enumerate(model.labels_):
            if label not in clusters.keys():
                # got an error here, needs to be int, not int64 - hence the type cast to int
                clusters.update({int(label): [words[idx]]})
            else:
                clusters[int(label)].append(words[idx])

        if self.print_output:
            for key, item in clusters.items():
                print(key, item)

        if self.save_output:
            # name of json is dynamic - all parameter values are integrated in the name of the file itself
            with open(f"{str(Path.cwd())}/results/{self.representation[0]}_dbscan_"
                      f"{metric}_{str(epsilon)}_{str(min_samples)}_{entity_name}_{selected_base_models}.json", "w+") \
                    as out:
                json.dump(clusters, out, indent=4, sort_keys=True)

        return clusters
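For completeness, the precomputed path of the dbscan method above, reduced to its core; a hypothetical sketch (word list, eps, and min_samples are assumptions):

import numpy as np
from leven import levenshtein     # assumed source of levenshtein
from sklearn.cluster import DBSCAN

words = ["google", "googel", "amazon", "amazn", "ibm"]   # hypothetical entity group
dist = np.array([[levenshtein(a, b) for a in words] for b in words])

# eps is measured in raw edit operations here, since the matrix holds plain distances.
model = DBSCAN(eps=2, min_samples=1, metric="precomputed")
model.fit(dist)
print(dict(zip(words, model.labels_)))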
Example #16
def lev_metric(x, y):
    i, j = int(x[0]), int(y[0])
    return levenshtein(data[i], data[j])
Example #17
    def feature_NORMALIZATION(self, lnk):
        edit = levenshtein(unicode(lnk["label"]), unicode(lnk["text"]))
        return float(edit) / len(lnk["text"])
Example #18
def _get_seq_match_score(seq_1, seq_2):
    """
    Can use any reasonable matching criteria. Using leven for now; rather
    overkill for what we need, but it's fine for now.
    """
    return leven.levenshtein(seq_1, seq_2)
Example #19
    def get_edit_distance(self, row, geoname):
        sn_name = row['NAME']
        leven_dist = levenshtein(geoname, sn_name)
        return leven_dist
Example #20
  
  # Excerpt from inside a loop over chunks (_c); leftNE is presumably loaded from a
  # matching "left" NE file in the code truncated above this excerpt.
  rightNE = []
  with open('/home/ahmad/duplicate-detection/eventregistrydata/pairs/StanfordNErightAttro2enesde_' + str(_c) + '.txt', 'r') as myfile:
    rightNE = myfile.readlines()

  # Strip the "[u'...']" list formatting from each line and split into tokens.
  leftNE = [item.replace("[u'", "").replace("[", "").replace("]", "").replace("', u'", " ").replace("'", "").split() for item in leftNE]
  rightNE = [item.replace("[u'", "").replace("[", "").replace("]", "").replace("', u'", " ").replace("'", "").split() for item in rightNE]
  for _i, _leftNE, _rightNE in zip(range(len(leftNE)), leftNE, rightNE):
    if Alllangpairs[n + _i][0] == Alllangpairs[n + _i][1]:
      score.append(-1)
      continue

    # Count named entities that fuzzily match across the pair (a normalized edit
    # distance below 0.25 counts as a match).
    nintersection = 0
    for _iteml in set(_leftNE):
      for _itemr in set(_rightNE):
        if 1.0 * levenshtein(_iteml, _itemr) / (len(_iteml) + len(_itemr)) < 0.25:
          nintersection += 1

    nunion = len(set(_leftNE)) + len(set(_rightNE))

    if nunion > 0:
      score.append(1.0 * nintersection / nunion)

  n += len(leftNE)


# Quick sanity checks on the accumulated scores (notebook-style expressions).
len(Alllangpairs), len(score), len(Allsentpairs), len(Allisdup_labels)
i = score.index(-1)
sum([True for _s, _l in zip(score, Alllangpairs[540000:540000 + len(score)]) if _l[0] == _l[1] and _s == -1])
sum([True for _s in score if _s == -1])
Example #21
def checkAuthorMatch(article1,article2,authorName=None,
                     yearGap=2, nCommonAuths=3,
                     matchThresh=0.70, absMatchLimit=0.5,
                     titleMatchLimit=0.5, keywordMatchTol=2):
    """
    Check if two articles are probably from the same author.

    I think they are if:
    1) They are published within yearGap of each other, from the same location
    or
    2) They share nCommonRefs or more
    or
    3) They share nCommonAuths or more
    or
    4) Author lists are >1 and have the same names.
    or
    5) Author of interest is spelled exactly the same, and they share 2+ keywords

    matchThresh sets the threshold for doing fuzzy string comparison with the
    affiliation names (so, 'Univeristy of Washington' will match
    'University of Washington, Astronomy Department').

    Returns
    -------
    bool

    """
    result = False
    # If any of the criteria match, return True and bail out

    if authorName is None:
        authorName = authSimple(article1.author[0])

    # If author list is >1 and identical
    if len(article1.authorset) > 1:
        if article1.authorset == article2.authorset:
            return True

    # If published within yearGap from same location
    if hasattr(article1,'year') & hasattr(article2,'year'):
        if np.abs(int(article1.year)-int(article2.year)) <= yearGap:
            aff1 = None
            aff2 = None
            lDists = [levenshtein(unicode(authorName),unicode(authSimple(name))) for
                      name in article1.author]
            good = np.where(lDists == np.min(lDists))[0]
            # Can't tell which one is author, can't link it up.
            if np.size(good) > 1:
                return False
            if lDists[good] < 3:
                aff1 = affClean(article1.aff[good])
            lDists = [levenshtein(unicode(authorName),unicode(authSimple(name))) for
                      name in article2.author]
            good = np.where(lDists == np.min(lDists))[0]
            if np.size(good) > 1:
                return False
            if lDists[good] < 3:
                aff2 = affClean(article2.aff[good])
            if (aff1 is not None) & (aff2 is not None):
                if checkAffMatch(aff1,aff2,matchThresh=matchThresh):
                    return True

    # Check if they have enough common authors
    commonAuthors = article1.authorset.intersection(article2.authorset)
    if len(commonAuthors) >= nCommonAuths:
        return True

    # If the keywords are in common
    if (article1.keyword is not None) & (article2.keyword is not None):
        if len(set(article1.keyword) & set(article2.keyword)) > keywordMatchTol:
            # If the author is spelled exactly the same, and there are matching keywords
            a1 = [author for author in article1.author if authSimple(author) == authSimple(authorName)]
            a2 = [author for author in article2.author if authSimple(author) == authSimple(authorName)]
            if (len(a1) == 1) & (len(a2) ==1):
                # If the names match exactly
                if a1[0] == a2[0]:
                    return True

    return result
Example #22
def lev_dist(X):
    # Returns the Levenshtein distance between all pairs of elements of a vector X.
    D = np.zeros((len(X), len(X)))
    for row in range(len(X)):
        for col in range(len(X)):
            D[row][col] = levenshtein(X[row], X[col])
    return D
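A quick hypothetical call of lev_dist (it relies on numpy imported as np and a levenshtein function, as the code above already assumes):

X = ["cat", "cart", "dog"]
print(lev_dist(X))
# A symmetric matrix with a zero diagonal, e.g.:
# [[0. 1. 3.]
#  [1. 0. 4.]
#  [3. 4. 0.]]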