-
Notifications
You must be signed in to change notification settings - Fork 0
/
similarity.py
167 lines (133 loc) · 6.35 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
from tweetTransform import removeFile
#from geopy.distance import vincenty
from numpy import ndarray
import numpy as np
from cmath import log10
from math import fabs
###############################################################################
def distanceMedoid(tweetAttributeFileName, medoidFileName, distanceFileName):
    """Compute a weighted, log-scaled distance from every tweet sample to each
    medoid, normalize it, and append it as a new column to distanceFileName.

    tweetAttributeFileName -- rows of "rowName attr1 attr2 ..." (one sample per line)
    medoidFileName         -- header line of attribute names, then one medoid row per group
    distanceFileName       -- read first for previously written columns (if any),
                              then rewritten with the new column appended

    Raises TypeError when a computed distance has a non-zero imaginary part,
    IndexError when the previous distance file has fewer rows than the input.
    """
    print("Counting distance from medoids using %s %s %s" % (tweetAttributeFileName, medoidFileName, distanceFileName))
    previousDistances = []
    # read existing distances so the new values can be appended as a new column
    try:
        distanceFile = open(distanceFileName, 'r')
        for line in distanceFile:
            previousDistances.append(line.replace('\n', ''))
        distanceFile.close()
    except IOError:
        print("%s no previous data" % distanceFileName)
    distanceFile = open(distanceFileName, 'w')
    tweetAttributeFile = open(tweetAttributeFileName, 'r')
    medoidFile = open(medoidFileName, 'r')
    attributesLine = medoidFile.readline()  # ignore attribute names header
    # read medoids into a list of column vectors: one (n_attributes, 1) array per group
    groupCollection = []
    groupIdx = 0
    for line in medoidFile:
        # skip row name
        line = line.split(' ', 1)[1]
        # split attributes
        featureCollection = line.split(' ')
        medoidCollection = ndarray((len(featureCollection), 1), float)
        # get medoid of each attribute to column; each row is a group
        for attributeIdx in range(0, len(featureCollection)):
            medoidCollection[attributeIdx] = float(' '.join(featureCollection[attributeIdx].split()))
        groupIdx += 1
        groupCollection.append(medoidCollection)
    # count distances to medoids
    lineIdx = 0
    maxDist = 0.0
    minDist = float('inf')
    currentDistances = []
    for line in tweetAttributeFile:
        # skip row name
        line = line.split(' ', 1)[1]
        # split attributes
        featureCollection = line.split(' ')
        attributes = ndarray((len(featureCollection), 1), float)
        for attributeIdx in range(0, len(featureCollection)):
            attributes[attributeIdx] = float(' '.join(featureCollection[attributeIdx].split()))
        # Euclidean distance from this sample to each medoid; note that
        # np.linalg.norm returns a scalar, which numpy broadcasts across the row
        distanceVec = ndarray((len(groupCollection), len(featureCollection)), float)
        for i in range(0, len(groupCollection)):
            distanceVec[i] = np.linalg.norm(attributes - groupCollection[i])
        distance = 0.0
        # TODO here we try to not have circles around a europe medoid; instead try to break it
        for i in range(0, len(distanceVec)):
            for j in range(0, len(attributes)):
                # weight attributes asymmetrically around the medoid so groups
                # are not plain circles (direction-dependent penalty)
                multitude = 1  # + 2*j
                if (j % 2 == 0) and (attributes[j] > groupCollection[i][j]):
                    multitude += 3  # * (attributes[j] - groupCollection[i][j])
                if (j % 2 == 1) and (attributes[j] <= groupCollection[i][j]):
                    multitude += 3  # * (groupCollection[i][j]-attributes[j])
                # multitude = 1 # this line makes the groups circular again
                # cmath.log10 returns a complex number; +1 keeps the argument off zero
                distance += float(multitude) * log10(distanceVec[i][j] + 1)
        real = float(0)
        if isinstance(distance, complex) and distance.imag != 0:
            print("Distance: (%s) is complex at line %d (%d,%d)" % (distance, lineIdx, i, j))
            raise TypeError()
        elif isinstance(distance, complex) and distance.imag == 0:
            real = distance.real
            currentDistances.append(real)
        else:
            real = distance
            currentDistances.append(distance)
        # gathering min and max to use for normalization
        if real > maxDist:
            maxDist = real
        if real < minDist:
            minDist = real
        lineIdx += 1
    lineIdx = 0
    # BUG FIX: these prints (and the one in the IndexError branch below) were
    # Python-2-only print statements; the rest of the file uses function-style
    # print, so the statement form broke parsing under Python 3
    print("previousDistances len is %d" % len(previousDistances))
    print("currentDistances len is %d" % len(currentDistances))
    prevDistLen = len(previousDistances)
    for distance in currentDistances:
        # every written distance is normalized
        # NOTE(review): denominator is maxDist, not (maxDist - minDist) —
        # confirm this scaling is intended
        if prevDistLen == 0:
            # need to have a first column with no data
            distanceFile.write("nothing " + str((distance * 32 - minDist * 32) / maxDist) + "\n")
        else:
            if prevDistLen <= lineIdx:
                print("line index is %d " % lineIdx)
                raise IndexError()
            distanceFile.write(previousDistances[lineIdx] + " " + str((distance * 32 - minDist * 32) / maxDist) + "\n")
        lineIdx += 1
    tweetAttributeFile.close()
    medoidFile.close()
    distanceFile.close()
###############################################################################
def distanceSqrLongPlusLat(tweetAttributeFileName, distanceFileName):
    """Write a pseudo-distance (longitude^2 + latitude) for every tweet row.

    Each input row is "rowName attr1 attr2 ..."; each output row is
    "nothing <value>" so the file can serve as a first distance column.
    NOTE(review): assumes attribute index 0 is latitude and index 1 is
    longitude -- confirm against the attribute file writer.
    """
    print("Counting distance by long^2+lat using %s %s" % (tweetAttributeFileName, distanceFileName))
    tweetAttributeFile = open(tweetAttributeFileName, 'r')
    removeFile(distanceFileName)
    distanceFile = open(distanceFileName, 'w')
    # count distances
    for row in tweetAttributeFile:
        # drop the leading row name, keep only the attribute columns
        fields = row.split(' ', 1)[1].split(' ')
        attributes = ndarray((len(fields), 1), float)
        for idx, field in enumerate(fields):
            attributes[idx] = float(' '.join(field.split()))
        attributes[1] += 40  # make longitude on + side
        # if attributes[0] > 40: # try to change strips to checkers
        #     attributes[0] = -1*attributes[0]
        distance = attributes[1]*attributes[1] + attributes[0]
        distanceFile.write("nothing " + str(distance) + "\n")
    tweetAttributeFile.close()
    distanceFile.close()
###############################################################################
def similarityCoord(summaryParsedCoord, summarySimilarityCoord):
    """Delegate coordinate-similarity computation to the native tfidf library.

    Removes any stale output file, then calls CountCoordinateSimilarity from
    ./cmake_tfidf/libtfidf.so with the input and output file names.
    NOTE(review): the names are passed straight to the C function; under
    Python 3 ctypes would expect bytes for char* arguments -- confirm.
    """
    print("Counting distances of coords...")
    removeFile(summarySimilarityCoord)
    # load the shared library inside the function so the module imports
    # even when the .so has not been built
    from ctypes import cdll
    tfidfLib = cdll.LoadLibrary('./cmake_tfidf/libtfidf.so')
    tfidfLib.CountCoordinateSimilarity(summaryParsedCoord, summarySimilarityCoord)
###############################################################################