Exemplo n.º 1
0
def check_car_make(name):
    """Classify *name* as a car make via fuzzy matching.

    The input is normalized (whitespace, case, backticks, dots) and compared
    against a list of known makes — several deliberately misspelled to match
    dirty source data — using Levenshtein distance; distance <= 2 counts as
    a match.

    Returns "car_make" on a match, "other" for the placeholder "-" or when
    nothing matches.
    """
    name = name.strip()
    name = name.lower()
    name = name.replace("`", "")
    name = name.replace(".", " ")
    # NOTE(review): the original called name.replace("0", "0"), a no-op —
    # presumably it was meant to map the digit 0 to the letter o (e.g.
    # "v0lvo"); confirm intent before restoring it as replace("0", "o").
    lst = [
        "volvo", "chrylsr", "datsun", "chevy", "chevrolet", "ford",
        "volkswagen", "vw", "buick", "mercury", "dodge", "subaru", "mazda",
        "audi", "honda", "hyundai", "plymth", "lincoln", "toyota", "renault",
        "peugot", "nissan", "isuzu", "cadillac", "yamaha", "jeep", "saab",
        "yamaha", "porshe", "oldsmob", "pontac", "ferrari", "mitsubi", "eagle",
        "jaguar", "camaro"
    ]
    if name == "-":
        return "other"
    for item in lst:
        if ld(name, item) <= 2:
            return "car_make"
    return "other"
    # Removed: an unreachable tail (dead code after the return above) that
    # also referenced an undefined name `ls`.
Exemplo n.º 2
0
def check_color(word, lclr):
    """Return "color" when *word* is within Levenshtein distance 1 of any
    entry in *lclr*; return "other" for null/placeholder values or when
    nothing matches."""
    cleaned = word.strip().lower()
    # "null" and "-" are placeholder values in the source data.
    if cleaned in ("null", "-"):
        return "other"
    for candidate in lclr:
        if ld(cleaned, candidate) <= 1:
            return "color"
    return "other"
Exemplo n.º 3
0
def homophone_matches(terms, targets):
    """Given two lists, return their intersection with lenience for
    homophones: a target matches when it is within Levenshtein distance 1
    of some term.  Matched targets are returned in their original
    (pre-unicode) form; duplicates are possible when several terms match
    the same target.  Python 2 code (relies on the `unicode` builtin)."""
    excluded_words = ['vs']
    targets = [t for t in targets if t not in excluded_words]
    orig_targets = targets
    terms_u = [unicode(t) for t in terms]
    targets_u = [unicode(t) for t in targets]
    matches = []
    for term in terms_u:
        for idx, target in enumerate(targets_u):
            if ld(term, target) <= 1:
                matches.append(orig_targets[idx])
    return matches
Exemplo n.º 4
0
def homophone_matches(terms, targets):
    # Given two lists, return intersection (with lenience for homophones).
    # A target counts as a match when it is within Levenshtein distance 1 of
    # some term; duplicates in the result are possible when several terms
    # match the same target.
    # NOTE(review): Python 2 code — relies on the `unicode` builtin and on
    # map() returning a list (len() below would fail on a Py3 map object).
    excluded_words = ['vs']
    targets = [t for t in targets if t not in excluded_words]
    # Keep the pre-unicode values so matches are returned in original form.
    orig_targets = targets
    terms = map(unicode, terms)
    targets = map(unicode, targets)
    matches = []
    for i in range(len(terms)):
        for j in range(len(targets)):
            if ld(terms[i], targets[j]) <= 1:
                matches.append(orig_targets[j])
    return matches
Exemplo n.º 5
0
def get_clusters(dic,prec):
  """Greedily cluster near-duplicate keys among the most frequent fifth of *dic*.

  *dic* maps key -> [count, doc_set]; *prec* is the Levenshtein-distance
  threshold strictly below which two keys sharing the same first character
  are merged.  Returns a defaultdict mapping each cluster representative to
  [total_count, number_of_distinct_docs, list_of_merged_keys].
  """
  clusters = defaultdict(lambda:[0,0,[]])
  # Top 20% of keys by dic.get value.
  # NOTE(review): len(dic)/5 relies on Python 2 integer division.
  l =  sorted(dic, key=dic.get, reverse=True)[:len(dic)/5]
  for i,e1 in enumerate(l):
    docs = dic[e1][1]
    clusters[e1][0]+=dic[e1][0]
    # Compare e1 against every later key; merge near-duplicates into e1's cluster.
    for e2 in l[i+1:]:
      # Same first character and edit distance under the threshold -> merge.
      if e1!=e2 and e1[0]==e2[0] and ld(e1,e2) < prec:
        clusters[e1][0]+=dic[e2][0]
        # Union the document sets so docs are counted once per cluster.
        docs = docs|dic[e2][1]
        clusters[e1][2].append(e2)
        # NOTE(review): removing from l while enumerate(l) is in progress
        # makes the outer loop skip positions; presumably intentional so a
        # merged key is never re-clustered — confirm before refactoring.
        l.remove(e2)
    clusters[e1][1]+=len(docs)
  return clusters
Exemplo n.º 6
0
def check_body(name):
    """Return "vehicle_type" when any whitespace-separated token of *name*
    is within Levenshtein distance 2 of a known vehicle/body keyword,
    otherwise "other"."""
    text = name.strip().lower()
    # Drop connectives.  Note "and" is removed as a raw substring (so it is
    # also stripped out of words that merely contain it), matching the
    # original normalization.
    text = text.replace("&", "").replace("and", "").replace("/", " ")
    tokens = text.split(" ")
    keywords = ["wagon", "truck", "sedan", "bike", "bus", "pick", "taxi",
                "van", "cycle", "cab", "motor", "door", "moped", "conv",
                "garbage", "mixer", "ambul", "passenger", "tank", "flat",
                "bed", "wheel", "limo", "vehicle", "dump", "train", "delv",
                "subn", "dsd", "util", "refg", "trlr", "pkup", "semi"]
    for token in tokens:
        for keyword in keywords:
            if ld(token, keyword) <= 2:
                return "vehicle_type"
    return "other"
Exemplo n.º 7
0
def check_name(name, content):
    """Return "person_name" when *name*, after normalization, looks like a
    person name: either its first comma-separated part is a single letter,
    or some part is within Levenshtein distance 3 of an entry in *content*.
    Placeholder inputs ("null", "-") and non-matches return "other"."""
    # Placeholder values are rejected before any normalization.
    if name in ("null", "-"):
        return "other"
    cleaned = name.lower()
    # Strip connectives in the same order as the original; "and" is removed
    # as a raw substring, and "-" removal happens after the "/" -> space swap.
    for old, new in (("&", ""), ("and", ""), ("/", " "), ("-", "")):
        cleaned = cleaned.replace(old, new)
    parts = cleaned.strip().split(",")
    # A single-letter leading part (e.g. an initial) counts as a name.
    if len(parts[0]) == 1:
        return "person_name"
    for reference in content:
        for part in parts:
            if ld(part.strip(), reference) <= 3:
                return "person_name"
    return "other"
Exemplo n.º 8
0
# from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StringIndexer, NGram
# from pyspark.ml.classification import LogisticRegression
# from pyspark.ml import Pipeline
# from pyspark.mllib.evaluation import MulticlassMetrics

#spark = SparkSession.builder.master("local[*]").getOrCreate()
# Build a local Spark context, shipping the bundled archive to the workers
# so its contents are importable on executors.
sc = SparkContext("local", pyFiles=["jlf.zip"])
spark = SparkSession \
    .builder \
    .appName("Big Data Project") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
#sc = SparkContext()
#sc.addPyFile("jellyfish")
# `ld` (Levenshtein distance) and `soundex` come from the jellyfish package;
# every check_* helper in this file calls ld.
from jellyfish import levenshtein_distance as ld, soundex
# Sanity check that jellyfish loaded correctly before running the pipeline.
print("This is starting : ", ld("anjan", "anjana"))
"""
List of functions written for semantic type : 
1. check_park
2. check_agency
3. check_subject
4. check_street
5. check_address
6. check_schLvl
7. check_website
8. check_build_cls
9. check_zipcode
10. check_school_name
11. check_borough
12. check_phoneNum
13. check_color
Exemplo n.º 9
0
                    # print "%s" % (str(bin(shortWordBits))[2:])
                    # print "%s" % (str(bin(compareWordsBits[i]))[2:])
                    # print "Difference: %d" % bitCount(shortWordBits ^ compareWordsBits[i]) 
                counter = counter + 1

# --- Report on the bit-filter prescreening stage --------------------------
# NOTE(review): Python 2 script (print statements).  Uses names defined
# outside this excerpt: t (Timer), banknamesShort/banknamesLong, counter,
# compareDict, Timer, threshold, successCounter — verify against the full file.
print "---------------------"
print "Bit Stage:  %s seconds" % t.secs
print "Number of Short Words: %d" % len(banknamesShort)
print "Number of Long Words: %d" % len(banknamesLong)
print "Theoretical Number of Matches (short x long): %d" % (len(banknamesShort) * len(banknamesLong))
print "Number of Total Comparisons: %d" % counter

# --- Levenshtein-distance stage -------------------------------------------
# For each short word, compute distances to its prescreened candidates and
# record how many candidates tie at the minimum distance (when within the
# threshold).
matchCount =[]
with Timer() as t:
    for shortWord, compareWordsList in compareDict.items():
        compareVals = [ld(shortWord, longWord) for longWord in compareWordsList]
        minVal = min(compareVals)
        if minVal <= threshold:
            # All candidates tying at the minimum distance count as matches.
            matches = [word for dist, word in zip(compareVals, compareWordsList) if dist == minVal]
            matchCount.append(len(matches))
            #matchString = ", ".join(matches)
            #print "Potential Matches for %s: %s, distance %d" % (shortWord, matchString, minVal)
        #else:
        #    print "No matches for %s" % shortWord

# Average matches per prescreened word, and fraction of short words matched.
avgMatches = float(len(matchCount))/len(compareDict.keys())
fracMatched = float(len(matchCount))/len(banknamesShort)
print "---------------------"
print "LD Stage: %s seconds" % t.secs
print "Number of Comparisons: %d" % successCounter
print "Average Number of Matches: %f" % avgMatches
Exemplo n.º 10
0
                    # print "Difference: %d" % bitCount(shortWordBits ^ compareWordsBits[i])
                counter = counter + 1

# --- Report on the bit-filter prescreening stage --------------------------
# NOTE(review): Python 2 script (print statements).  Uses names defined
# outside this excerpt: t (Timer), banknamesShort/banknamesLong, counter,
# compareDict, Timer, threshold — verify against the full file.
print "---------------------"
print "Bit Stage:  %s seconds" % t.secs
print "Number of Short Words: %d" % len(banknamesShort)
print "Number of Long Words: %d" % len(banknamesLong)
print "Theoretical Number of Matches (short x long): %d" % (
    len(banknamesShort) * len(banknamesLong))
print "Number of Total Comparisons: %d" % counter

# --- Levenshtein-distance stage -------------------------------------------
# For each short word, compute distances to its prescreened candidates and
# record how many candidates tie at the minimum distance (when within the
# threshold).
matchCount = []
with Timer() as t:
    for shortWord, compareWordsList in compareDict.items():
        compareVals = [
            ld(shortWord, longWord) for longWord in compareWordsList
        ]
        minVal = min(compareVals)
        if minVal <= threshold:
            # All candidates tying at the minimum distance count as matches.
            matches = [
                word for dist, word in zip(compareVals, compareWordsList)
                if dist == minVal
            ]
            matchCount.append(len(matches))
            #matchString = ", ".join(matches)
            #print "Potential Matches for %s: %s, distance %d" % (shortWord, matchString, minVal)
        #else:
        #    print "No matches for %s" % shortWord

# Average matches per prescreened word, and fraction of short words matched.
avgMatches = float(len(matchCount)) / len(compareDict.keys())
fracMatched = float(len(matchCount)) / len(banknamesShort)