Example #1
def soft_tfidf(text, hypo):
    global tf
    # `text` and `hypo` arrive as token lists; join them for the vectorizer.
    text = ' '.join(text)
    hypo = ' '.join(hypo)
    text_resp = tf.transform([text])
    hypo_resp = tf.transform([hypo])

    valores_tf_idf_text = []
    valores_tf_idf_hypo = []
    features_names_text = []
    features_names_hypo = []

    # Collect the active features and their TF-IDF weights for each string.
    feature_names = tf.get_feature_names()
    for col in text_resp.nonzero()[1]:
        features_names_text.append(feature_names[col])
        valores_tf_idf_text.append(text_resp[0, col])
    for col in hypo_resp.nonzero()[1]:
        features_names_hypo.append(feature_names[col])
        valores_tf_idf_hypo.append(hypo_resp[0, col])

    # Soft TF-IDF: sum similarity-weighted products over close word pairs.
    valor = 0
    for i, word_t in enumerate(features_names_text):
        for j, word_h in enumerate(features_names_hypo):
            sim = jaro.jaro_winkler_metric(word_t, word_h)
            if sim > 0.9:
                valor += sim * valores_tf_idf_text[i] * valores_tf_idf_hypo[j]
    return valor
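Note that soft_tfidf depends on a module-level TfidfVectorizer named tf that must be fitted before the call. A minimal setup sketch (the corpus here is illustrative, not from the original project, and it assumes an scikit-learn version that still provides get_feature_names):

import jaro
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical setup for the global `tf` that soft_tfidf expects.
corpus = ["the cat sat on the mat", "a dog slept on the rug"]
tf = TfidfVectorizer()
tf.fit(corpus)

# soft_tfidf takes token lists and joins them internally.
print(soft_tfidf("the cat sat".split(), "the kat sat".split()))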
Example #2
def jaro_winkler(__str1, __str2):
    def to_unicode(__str):
        # Python 2: pass unicode through untouched, coerce everything else.
        if isinstance(__str, unicode):
            return __str
        return unicode(__str)

    return jaro.jaro_winkler_metric(to_unicode(__str1), to_unicode(__str2))
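This helper targets Python 2, where unicode is a builtin. Under Python 3 every str is already Unicode text, so a hedged equivalent (a sketch, not part of the original source) reduces to:

import jaro

def jaro_winkler_py3(s1, s2):
    # Python 3 strings are already Unicode; str() coerces anything else.
    return jaro.jaro_winkler_metric(str(s1), str(s2))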
Example #3
def create_similarity_matrix(sentences):
    '''
    Create an N x N similarity matrix, where N is the number of sentences and
    the similarity of each pair of sentences is their Jaro-Winkler score.

    params:
        sentences (List -> String) : The list of strings to build the
                                     similarity matrix from.

    returns:
        A square numpy matrix of pairwise Jaro-Winkler similarities.
    '''

    # identify sentence similarity matrix with Jaro Winkler score
    sentence_length = len(sentences)
    sim_mat = np.zeros((sentence_length, sentence_length))

    for i in range(sentence_length):
        for j in range(sentence_length):
            if i != j:
                similarity = jaro.jaro_winkler_metric(sentences[i],
                                                      sentences[j])
                sim_mat[i][j] = similarity
    return sim_mat
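A quick usage sketch with illustrative sentences (it assumes numpy as np and the jaro module are imported where create_similarity_matrix is defined):

import numpy as np
import jaro

sentences = ["the quick brown fox", "the quick brown dog", "lorem ipsum dolor"]
sim_mat = create_similarity_matrix(sentences)
print(sim_mat.shape)  # (3, 3); the diagonal stays 0.0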
Example #4
def jwSim(v1, v2):
    try:
        return jaro.jaro_winkler_metric(v1.decode('utf-8').strip().lower(),
                                        v2.decode('utf-8').strip().lower())
    except:
        # Decoding or comparison failed; report no similarity.
        return 0.
Example #5
def jaro_w(word, topSimilarity):
    nw_candidates = {}
    for c in freqDict:
        similarity = jaro.jaro_winkler_metric(word, c)
        if similarity > 0.5:
            nw_candidates[c] = similarity

    if len(nw_candidates) == 0:
        return [word]

    else:

        sorted_candidates_word_freq = sorted(nw_candidates.items(),
                                             key=lambda x: x[1],
                                             reverse=True)
        sorted_candidate_word = []

        for i in range(len(sorted_candidates_word_freq)):
            sorted_candidate_word.append(sorted_candidates_word_freq[i][0])

        words = []

        if topSimilarity:
            return sorted_candidate_word[0:min(candidates_nb,
                                               len(sorted_candidate_word) -
                                               1)]  # The first 3 words
        else:
            sorted_candidate_word = sorted_candidate_word[
                0:min(candidates_nb,
                      len(sorted_candidate_word) - 1)]
            for i in range(min(len(sorted_candidate_word), candidates_nb)):
                words.append(max(sorted_candidate_word, key=P))
                sorted_candidate_word.remove(words[i])
            return words
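Note that jaro_w reads three module-level names the snippet does not define: freqDict (a word-frequency table), candidates_nb (how many candidates to keep), and P (a word-probability function). A hypothetical minimal setup, purely for illustration:

import jaro
from collections import Counter

# Hypothetical globals assumed by jaro_w; the original project defines its own.
freqDict = Counter("the they them then the".split())
candidates_nb = 3
_total = float(sum(freqDict.values()))

def P(word):
    # Relative corpus frequency of word.
    return freqDict[word] / _total

print(jaro_w("teh", topSimilarity=True))  # top candidates, e.g. ['they', 'them', 'then']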
Example #6
def match_by_title(query, result_titles, correct_titles):
    import jaro

    # Lower-case all entries first (Python 2: map returns a list here).
    correct_titles = map(unicode.lower, correct_titles)

    matched_ids = []
    for rtitle in result_titles:

        rtitle = rtitle.lower()

        best_jw = 0.0
        best_id = 0
        for i, ctitle in enumerate(correct_titles):
            jw = jaro.jaro_winkler_metric(unicode(rtitle), unicode(ctitle))
            if jw > best_jw:
                best_jw = jw
                best_id = i

        # Check if we have a match
        if best_jw >= 0.9:
            matched_ids.append(str(best_id))
        else:
            matched_ids.append('')

    # Create an incremental list of string ids for the correct
    # titles, since they are not in the database
    correct_ids = map(str, range(len(correct_titles)))

    return correct_ids, matched_ids
Example #7
def findFamilyOrNot(surnames, row):
    result = []

    a = combinations(range(len(surnames)), 2)
    for check in a:
        result.append(
            jr.jaro_winkler_metric(surnames[check[0]], surnames[check[1]]))

    if len(result) == 0:
        if int(row['age']) >= 18:
            return "single_adult"
        else:
            return "single_child"
    elif max(result) > .86:
        return "family"
    else:
        return "probable family or not"
Example #8
def findParenting(row, uiText=True):
    # Assumes jr (the jaro module) and parentingType are defined at module level.
    if not row['surnames']:
        return

    surnames = row['surnames']
    if len(surnames) < 2:
        return

    ages = list(map(int, row["roomies"].split(',')))
    ages_abs = [abs(int(x)) for x in ages]
    ages_under18 = [x for x in ages_abs if x < 18]
    age_surname_dict = dict(zip(ages_abs, surnames))

    parent_type = set()

    if (row['age'] >= 18) and (len(ages_under18) > 0):
        for child_age in ages_under18:
            if (jr.jaro_winkler_metric(row['surname'],
                                       age_surname_dict[child_age])) > .80:
                parent_type.add(parentingType(child_age, uiText))

    if len(parent_type):
        return parent_type.pop() if uiText else "/".join(parent_type)
Example #9
def jaro_winkler(s1: str, s2: str):
    return jaro.jaro_winkler_metric(s1, s2)
Example #10
import jaro

with open("password1_analysis.txt", "w") as writer:
    writer.write("actual, guess, jaro-winkler\n")
    total = 0.0
    number = 0.0
    with open("password1.txt", "r") as f:
        f.readline()
        for line in f.readlines():
            parts = line.split()
            actual = parts[1]
            guess = parts[2]

            # Python 2: coerce to unicode before comparing.
            jaro_winkler = jaro.jaro_winkler_metric(unicode(actual), unicode(guess))
            total += jaro_winkler
            number += 1
            writer.write(actual + ", " + guess + ", " + str(jaro_winkler) + "\n")
    writer.write("Average: " + str(total/number))
Example #11
def recommendPaper(request):
    cats = [
        'astro-ph.GA',
        'astro-ph.CO',
        'astro-ph.EP',
        'astro-ph.HE',
        'astro-ph.IM',
        'astro-ph.SR',
        'cond-mat.dis-nn',
        'cond-mat.mes-hall',
        'cond-mat.other',
        'cond-mat.quant-gas',
        'cond-mat.soft',
        'cond-mat.stat-mech',
        'cond-mat.str-el',
        'cond-mat.supr-con',
        'gr-qc',
        'hep-ex',
        'hep-lat',
        'hep-ph',
        'hep-th',
        'math.MP',
        'nlin.AO',
        'nlin.CG',
        'nlin.CD',
        'nlin.SI',
        'nlin.PS',
        'nucl-ex',
        'nucl-th',
        'physics.acc-ph',
        'physics.app-ph',
        'physics.atom-ph',
        'physics.atm-clus',
        'physics.bio-ph',
        'physics.chem-ph',
        'physics.class-ph',
        'physics.comp-ph',
        'physics.data-an',
        'physics.flu-dyn',
        'physics.gen-ph',
        'physics.geo-ph',
        'physics.hist-ph',
        'physics.ins-det',
        'physics.med-ph',
        'physics.optics',
        'physics.ed-ph',
        'physics.soc-ph',
        'physics.plasm-ph',
        'physics.pop-ph',
        'physics.space-ph',
        'quant-ph',
        'math.AG',
        'math.AT',
        'math.AP',
        'math.CT',
        'math.CA',
        'math.CO',
        'math.AC',
        'math.CV',
        'math.DG',
        'math.DS',
        'math.FA',
        'math.GM',
        'math.GN',
        'math.GT',
        'math.GR',
        'math.HO',
        'math.IT',
        'math.KT',
        'math.LO',
        'math.MP',
        'math.MG',
        'math.NT',
        'math.NA',
        'math.OA',
        'math.OC',
        'math.PR',
        'math.QA',
        'math.RT',
        'math.RA',
        'math.SP',
        'math.ST',
        'math.SG',
        'cs.AI',
        'cs.CL',
        'cs.CC',
        'cs.CE',
        'cs.CG',
        'cs.GT',
        'cs.CV',
        'cs.CY',
        'cs.CR',
        'cs.DS',
        'cs.DB',
        'cs.DL',
        'cs.DM',
        'cs.DC',
        'cs.ET',
        'cs.FL',
        'cs.GL',
        'cs.GR',
        'cs.AR',
        'cs.HC',
        'cs.IR',
        'cs.IT',
        'cs.LO',
        'cs.LG',
        'cs.MS',
        'cs.MA',
        'cs.MM',
        'cs.NI',
        'cs.NE',
        'cs.NA',
        'cs.OS',
        'cs.OH',
        'cs.PF',
        'cs.PL',
        'cs.RO',
        'cs.SI',
        'cs.SE',
        'cs.SD',
        'cs.SC',
        'cs.SY',
        'q-bio.BM',
        'q-bio.CB',
        'q-bio.GN',
        'q-bio.MN',
        'q-bio.NC',
        'q-bio.OT',
        'q-bio.PE',
        'q-bio.QM',
        'q-bio.SC',
        'q-bio.TO',
        'q-fin.CP',
        'q-fin.EC',
        'q-fin.GN',
        'q-fin.MF',
        'q-fin.PM',
        'q-fin.PR',
        'q-fin.RM',
        'q-fin.ST',
        'q-fin.TR',
        'stat.AP',
        'stat.CO',
        'stat.ML',
        'stat.ME',
        'stat.OT',
        'stat.TH',
        'eess.AS',
        'eess.IV',
        'eess.SP',
        'eess.SY',
        'econ.EM',
        'econ.GN',
        'econ.TH',
        'physics',
        'astro-ph',
        'cond-mat',
        'nlin',
        'physics',
        'math',
        'cs',
        'CoRR',
        'q-bio',
        'q-fin',
        'stat',
        'eess',
        'econ'
    ]
    user = request.GET.get('user')
    try:
        obj = models.UserModel.objects.get(userName=user)
    except models.UserModel.DoesNotExist:
        return HttpResponse("invalid username")
    sums = []
    collectDict = ast.literal_eval(obj.collectDict)
    for (k,v) in collectDict.items():
        sums.append(v['summary'])
    sums = sums[:20]
    fields = []
    focusList = obj.focusList
    for f in focusList:
        if int(f)<154:
            fields.append(cats[int(f)-1])
    random.shuffle(fields)
    fields = fields[:1]
    papers = []
    for cat in fields:
        print(cat)
        url = "http://export.arxiv.org/api/query?search_query=cat:" + cat + "&sortBy=submittedDate&sortOrder=descending&max_results=10"
        res = requests.get(url)
        data = res.text
        soup = bs(data, 'lxml')
        entries = soup.find_all('entry')
        for entry in entries:
            newPaper = Paper()
            newPaper.id = entry.id.string
            newPaper.updatedTime = entry.updated.string
            newPaper.publishedTime = entry.published.string
            newPaper.title = entry.title.string
            newPaper.summary = entry.summary.string
            authors = entry.find_all('author')
            for author in authors:
                newPaper.author.append(author.contents[1].string)
            categories = entry.find_all('category')
            for category in categories:
                newPaper.category.append(category.get('term'))
            links = entry.find_all('link')
            for link in links:
                if link.get('title') == 'doi':
                    newPaper.doiLink = link.get('href')
                elif link.get('title') == 'pdf':
                    newPaper.pdfLink = link.get('href')
                else:
                    newPaper.paperLink = link.get('href')
            papers.append(newPaper)
    for paper in papers:
        for summary in sums:
            paper.score += jaro.jaro_winkler_metric(paper.summary, summary)
    # Sort by total similarity, highest first.
    papers = sorted(papers, key=attrgetter('score'), reverse=True)
    if len(papers) > 10:
        papers = papers[:10]
    res = dict()
    res['resCode'] = 200
    res['len'] = len(papers)
    res['papers'] = []
    for paper in papers:
        p = dict()
        p['author'] = ''
        for author in paper.author:
            p['author'] += (author+'/')
        p['id'] = paper.id
        p['updatedTime'] = paper.publishedTime
        p['title'] = paper.title
        p['summary'] = paper.summary
        p['category'] = ''
        for category in paper.category:
            p['category'] += (category+'/')
        p['doiLink'] = paper.doiLink
        p['paperLink'] = paper.paperLink
        p['pdfLink'] = paper.pdfLink
        res['papers'].append(p)
    res = json.dumps(res)
    return HttpResponse(res)
Example #12
def jwm(x, y):
    return jaro_winkler_metric(unicode(x.lower()), unicode(y.lower()))
Example #13
def get(folder, host, user, password, database, incremental_ind):

    def id_generator(size=25, chars=string.ascii_lowercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    punctuation = "( + ) [ ? : ! . ; ] * # % ` ' / _ = -".split()
    punctuation.append('"')

    ###SETUP MAJOR VARS

    fdmain = folder + "/location_disambiguation/"
    # need to figure out what this is

    #separate first(0) and incremental(1) disambiguations
    incremental = incremental_ind

    # Step 1
    mydb = MySQLdb.connect(host, user, password, database)
    cursor = mydb.cursor()

    if incremental == 0:
        increm = ''
    else:
        increm = ' AND (location_id is NULL or location_id = "")'   

    print "Step 1..."

    cursor.execute('select distinct country_transformed from rawlocation where country_transformed is not NULL and country_transformed != "" and country_transformed!="s" and country_transformed!="B." and country_transformed!="omitted" '+increm)


    countries = [item[0] for item in cursor.fetchall() if item[0] is not None]
    print countries

    
    os.makedirs(fdmain)
    os.makedirs(fdmain+'uspto_disamb/')
    os.makedirs(fdmain+'uspto_disamb_counts/')
    os.makedirs(fdmain+'uspto_disamb_v2/')
    os.makedirs(fdmain+'uspto_disamb_loc_latlong/')
    os.makedirs(fdmain+'uspto_disamb_only_loc/')


    for c in countries: 
        print c
        datum = {}
        output = open(fdmain+'uspto_disamb/'+c+'.tsv','wb')
        output2 = open(fdmain+'uspto_disamb_counts/'+c+'.tsv','wb')
        outp = csv.writer(output,delimiter='\t')

        outp2 = csv.writer(output2,delimiter='\t')
        cursor.execute("select city,state,country_transformed,count(city) from rawlocation where country_transformed = '"+c+"'"+increm+"  group by city,state order by count(city) desc")
        outp2.writerows(cursor.fetchall())
        cursor.execute('select distinct state from rawlocation where country_transformed = "'+c+'"'+increm)
        states = [f[0] for f in cursor.fetchall()]
        for s in states:
            if str(s) == 'None' or str(s)=='NULL':
                cursor.execute('select id,city from rawlocation where country_transformed = "'+c+'" and (state is NULL or state="NULL")'+increm)
                s = ''
            else:
                s = re.sub('[\n\t\f\r]+','',s.strip())
                cursor.execute('select id,city from rawlocation where country_transformed = "'+c+'" and state ="'+s+'"'+increm)
            locs = [list(f) for f in cursor.fetchall()]
            for l in locs:
                ll = []
                for l1 in l:
                    if l1:
                        ll.append(re.sub('[\n\t\r\f]+','',l1.strip()))
                    else:
                        ll.append('')
                outp.writerow(ll+[s,c])
        output.close()
        output2.close()
        
    print "Step 2..."
    fd = fdmain+'uspto_disamb_counts/'
    diri = os.listdir(fd)

    mastdata = {}
    mastdatum = {}
    for d in diri:
        #this is separate from the forloop below because otherwise places that are in the wrong file break it
        mastdata[d.replace('.tsv','')] = {}
        mastdatum[d.replace('.tsv','')] = {}
    for d in diri:
        input = open(fd+d,'rb')
        inp = csv.reader(input,delimiter='\t')
        try:
            head = inp.next()
            top = int(head[-1])
        except:
            # Missing or malformed header row; fall back to a zero threshold.
            top = 0
        num = 1
        for i in inp:
            num+=1
        inp = csv.reader(file(fd+d),delimiter='\t')
        for e,i in enumerate(inp):
            if e<=int(num/3) and int(i[-1])>int(top/5):
                city = unidecode(i[0])
                for p in punctuation:
                    city = city.replace(p,'')
                city = re.sub('[0-9]+','',city)
                city = re.sub('^\s+','',city)
                city = re.sub('\s+$','',city)
                city = city.replace(' ','')
                state = i[1]
                state = re.sub('^\s+','',state)
                state = re.sub('\s+$','',state)
                country = i[2]
                key = id_generator(size=12)
                try:
                    gg = mastdata[country][city.lower()+'_'+state.lower()]
                except:
                    #print len(mastdata[country])
                    mastdata[country][city.lower()+'_'+state.lower()] = [key,i[0].strip(),i[1].strip(),i[2],int(i[3])]
                    mastdatum[country][city.lower()] = [key,i[0],i[1].strip(),i[2].strip(),int(i[3])]

        input.close()

    print "Step 3..."
    # Step 3
    fd = fdmain+'uspto_disamb/'
    diri = os.listdir(fd)
    for d in diri:
        output = open(fdmain+'uspto_disamb_v2/'+d,'wb')
        input = open(fd+d,'rb')
        outp = csv.writer(output,delimiter='\t')
        inp = csv.reader(input,delimiter='\t')
        data = mastdata[d.replace('.tsv','')]
        datum = mastdatum[d.replace(".tsv",'')]
        secdata = {}
        secdatum = {}
        for i in inp:
            city = unidecode(i[1])
            state = i[2]
            country = i[3]
            for p in punctuation:
                city = city.replace(p,'')
            city = re.sub('[0-9]+','',city)
            city = re.sub('^\s+','',city)
            city = re.sub('\s+$','',city)
            origcity = city
            city = city.replace(' ','')
                
            try:
                gg = data[city.lower()+'_'+state.lower()]
                outp.writerow(i+gg)
            except:
                try:
                    cit = city.lower().split(",")[0]
                    gg = data[cit.lower()+'_'+state.lower()]
                    
                    outp.writerow(i+gg)
                except:
                    try:
                        cit = city.lower().split("/")
                        for cc in cit:
                            gg = data[cc.lower()+'_'+state.lower()]
                            outp.writerow(i+gg)
                            break
                    except:
                        try:
                            cit = city.lower().split("-")
                            for cc in cit:
                                gg = data[cc.lower()+'_'+state.lower()]
                                outp.writerow(i+gg)
                                break
                        except:
                            try:
                                cit = city.lower().split("&")[0]
                                gg = data[cit.lower()+'_'+state.lower()]
                            
                                outp.writerow(i+gg)
                            except:
                                try:
                                    gg = datum[city.lower()]
                        
                                    outp.writerow(i+gg)
                                except:
                                    try:
                                        howdy = 0
                                        for k, v in data.items():
                                            dist = jaro.jaro_winkler_metric((city.lower()+'_'+state.lower()).decode('utf-8','ignore'), k.decode('utf-8','ignore'))
                                            edit = nltk.edit_distance(city.lower()+'_'+state.lower(), k)
                                            if (re.search(k.split("_")[0], city.lower()) and k.split("_")[0] != '') or dist >= 0.95 or (edit == 2 and len(city.lower()) > 5):
                                                outp.writerow(i+v)
                                                howdy = 1
                                                break
                                        gg = datum[city]
                                    except:
                                        if howdy == 0:
                                            cit = [cc for cc in origcity.lower().split(" ") if len(cc) > 4]
                                            
                                            howdy2 = 0
                                            for cc in cit:
                                                try:
                                                    gg = datum[cc]
                                                    
                                                    outp.writerow(i+gg)
                                                    howdy2 = 1
                                                    break
                                                except:
                                                    pass
                                            
                                            if howdy2 == 0:
                                                try:
                                                    gg = secdata[city.lower()+'_'+state.lower()]
                                                    outp.writerow(i+gg)
                                                except:
                                                    try:
                                                        cit = city.lower().split(",")[0]
                                                        gg = secdata[cit.lower()+'_'+state.lower()]
                                                        outp.writerow(i+gg)
                                                    except:
                                                        try:
                                                            cit = city.lower().split("&")[0]
                                                            gg = secdata[cit.lower()+'_'+state.lower()]
                                                            outp.writerow(i+gg)
                                                        except:
                                                            try:
                                                                gg = secdatum[city.lower()]
                                                                outp.writerow(i+gg)
                                                            except:
                                                                try:
                                                                    howdy = 0
                                                                    gg = datum[city]
                                                                except:
                                                                    if howdy == 0:
                                                                        cit = [cc for cc in origcity.lower().split(" ") if len(cc) > 4]
                                                                        howdy2 = 0
                                                                        for cc in cit:
                                                                            try:
                                                                                gg = secdatum[cc]
                                                                                outp.writerow(i+gg)
                                                                                howdy2 = 1
                                                                                break
                                                                            except:
                                                                                pass
                                                                        if howdy2 == 0:
                                                                            key = id_generator(size=12)
                                                                            secdata[city.lower()+'_'+state.lower()] = [key,i[1],i[2],i[3]]
                                                                            secdatum[city.lower()] = [key,i[1],i[2],i[3]]
                                                                            outp.writerow(i+[key,i[1],i[2],i[3]])
        input.close()
        output.close()
                                    
    print "Step 4..."
    #Step 4
    fd = fdmain+'uspto_disamb_v2/'
    fd3 = fdmain+'uspto_disamb_only_loc/'
    diri = os.listdir(fd)

    for d in diri:
        input = open(fd+d,'rb')
        output = open(fd3+d,'wb')
        inp = csv.reader(input,delimiter='\t')
        outp2 = csv.writer(output,delimiter='\t')
        data = {}
        final = {}
        disamb = {}
        for i in inp:
            try:
                gg = data[' '.join(i[5:])]
                final[i[0]] = i[:4]+[gg]+i[5:]
            except:
                try:
                    data[' '.join(i[5:])] = i[4]
                    final[i[0]] = i
                    disamb[i[4]] = i[4:]
                except:
                    print d,i
        input.close()
        for k,v in disamb.items():
            if len(v) == 5:
                v = v[:-1]
            outp2.writerow(v)
        output.close()

    print "Done Steps 1-4"