Example #1
def parseJSON():
    data = util.loadJSON(constants.JSON_FILE)
    data_index = []
    for obj in data:
        data_dict = dict()
        temp_text = util.removePunctuation(str(data[obj]['text']))
        stopped_temp_text = util.removeStopWords(temp_text, constants.STOP_LIST)
        temp_length = len(temp_text.split(" "))
        data_dict['text'] = temp_text.lower()
        data_dict['doc_length'] = temp_length
        data_dict['doc_length_stopped'] = len(stopped_temp_text.split(" "))

        meta_data = {
            "index": {
                "_index": constants.INDEX_NAME,
                "_type": constants.TYPE_NAME,
                "_id": str(obj)
            }
        }

        data_index.append(meta_data)
        data_index.append(data_dict)

    print "Complete JSON parsed..."
    return data_index
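
A usage sketch (not from the original source): the list returned above alternates bulk-action metadata and document bodies, which is the shape the Elasticsearch bulk API expects. Assuming the ES_CLIENT and INDEX_NAME constants defined in Example #17:

bulk_data = parseJSON()
constants.ES_CLIENT.bulk(index=constants.INDEX_NAME, body=bulk_data)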
Example #2
def createMetaData(review_data, userListMap):
  globalBusinessMap = {}
  for review in review_data:
    userID = review['user_id']
    businessID = str(review['business_id'])  # hoisted so the global map below is always defined
    if userID in userListMap:
      reviewID = str(review['review_id'])
      if businessID not in userListMap[userID]['restaurantMap']:
        userListMap[userID]['restaurantMap'][businessID] = []
      userListMap[userID]['restaurantMap'][businessID].append(reviewID)
      userListMap[userID]['reviewMap'][reviewID] = (review['stars'], review['date'])
    if businessID not in globalBusinessMap:
      globalBusinessMap[businessID] = [userID]
    elif userID not in globalBusinessMap[businessID]:
      globalBusinessMap[businessID].append(userID)

  business_data = util.loadJSON('../yelp/restaurants.JSON')
  for user_id, user in userListMap.items():
    restaurantSet = set(user['restaurantMap'].keys())
    calculateAttrPref(user, restaurantSet, business_data)

  with open('userMap.json', 'w') as user_map_file:
    json.dump(userListMap, user_map_file, indent=4)

  with open('globalBusinessMap.json', 'w') as business_map_file:
    json.dump(globalBusinessMap, business_map_file, indent=4)
Example #3
def getModel(num_categories, split, get_names=False):
    bucket_width = 4.0 / num_categories
    buckets = [1 + bucket_width * (i + 1) for i in range(num_categories - 1)]
    buckets.append(5)
    if get_names:
        train_inputs, train_labels, ingredient_list = loadJSON(input_file,
                                                               split,
                                                               buckets=buckets,
                                                               get_names=True)
        model = train(train_inputs, train_labels, num_categories)
        return model, ingredient_list, buckets, bucket_width

    train_inputs, train_labels, test_inputs, test_labels = loadJSON(
        input_file, split, buckets=buckets)
    model = train(train_inputs, train_labels, num_categories)
    return model, test_inputs, test_labels
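
For concreteness: with num_categories = 4, bucket_width is 1.0 and buckets comes out to [2.0, 3.0, 4.0, 5], i.e. the 1-5 star range is cut into four equal intervals whose upper edges label the categories.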
Example #4
def downloadData(param: Param, download: bool = True):
    '''Download user data (if {download} is True) to json files, merge them into a flat pandas.DataFrame, and write it to disk.'''
    logging.info(f"{param.filePath().name.replace('.','|')}")
    if download:
        subMethod = param.splitMethod(lower=True)
        for f in param.filePath(glob='*json'):
            f.unlink()
        pbarManager = enlighten.get_manager()
        with pbarManager.counter(unit='page', leave=False) as pbar:
            while param.page <= param.nPages:
                fileName = param.filePath(ext=f'.{param.page:04d}.json')
                response = getReq(param=param,
                                  pbarManager=pbarManager,
                                  collapse=False)
                param.page = int(
                    response.get(subMethod).get('@attr').get('page'))
                param.nPages = int(
                    response.get(subMethod).get('@attr').get('totalPages'))
                pbar.total = param.nPages  # [tqdm: update total without resetting time elapsed](https://stackoverflow.com/a/58961015/13019084)
                pbar.update()
                param.filePath().parent.mkdir(exist_ok=True)
                with open(file=fileName, mode='w') as jsonF:
                    json.dump(obj=response, fp=jsonF)
                param.page += 1
                time.sleep(param.sleep)
        pbarManager.stop()
    DF = loadJSON(param)
    df = flattenDF(param=param, DF=DF, writeToDisk=True)
    if param.splitMethod() in ['TopArtists', 'TopAlbums', 'TopTracks']:
        writeCSV(param=param, df=df)
Example #5
def createFoodNetworkEdgeLists(userMapFile, reviewJSONFile, thresholdJaccard, thresholdRating, thresholdAttr, numUsers):
  userListMap = pickle.load(open(userMapFile, "rb"))
  review_data = util.loadJSON(reviewJSONFile)

  # 1) add meta-data
  for userID in userListMap.keys():
    userListMap[userID]['restaurantMap'] = {}  # create empty restaurantMap for each user
    userListMap[userID]['reviewMap'] = {} # create empty reviewMap for each user
  createMetaData(review_data, userListMap)
  print('number of users', len(userListMap))
  print(userListMap)

  # 2) calculate scores for each edge
  jaccardVals = createEdgeVals(userListMap, True, False, False)
  attrVals = createEdgeVals(userListMap, False, True, False)
  ratiVals = createEdgeVals(userListMap, False, False, True)

  # 3) create food network for each score type
  jaccardNtwkFile = 'food_ntwk_random/jacc_edge_list_{}users_{}.txt'.format(numUsers, thresholdJaccard)
  ratiNtwkFile = 'food_ntwk_random/rati_edge_list_{}users_{}.txt'.format(numUsers, thresholdRating)
  attrNtwkFile = 'food_ntwk_random/attr_edge_list_{}users_{}.txt'.format(numUsers, thresholdAttr)
  createFoodNetwork(jaccardVals, userListMap, jaccardNtwkFile, thresholdJaccard)
  createFoodNetwork(ratiVals, userListMap, ratiNtwkFile, thresholdRating)
  createFoodNetwork(attrVals, userListMap, attrNtwkFile, thresholdAttr)

  # 4) check if valid edge list and print info about each network
  g = snap.LoadEdgeList(snap.PUNGraph, jaccardNtwkFile, 0, 1)
  print('Jaccard Network: Num Nodes = {}, Num Edges = {}'.format(g.GetNodes(), g.GetEdges()))
  g = snap.LoadEdgeList(snap.PUNGraph, attrNtwkFile, 0, 1)
  print('Attribute Network: Num Nodes = {}, Num Edges = {}'.format(g.GetNodes(), g.GetEdges()))
  g = snap.LoadEdgeList(snap.PUNGraph, ratiNtwkFile, 0, 1)
  print('Rating Network: Num Nodes = {}, Num Edges = {}'.format(g.GetNodes(), g.GetEdges()))
Example #6
def get_email(query):
    names = util.loadJSON("contacts.json")
    toField = None
    for contact in names:
        if contact["name"] in query:
            toField = contact["email"]
            break
    return toField
Example #7
def createUsersToBizMap():
  review_data = util.loadJSON('../yelp/review.json')
  restaurantMap = load_data('../yelp/restaurants.json')
  UsersToBizMap = {}
  # userID: set(bizID1, bizID2, ...)   --> maps every user that's been to every restaurant
  for review in review_data:
    review_userID = review['user_id']
    businessID = str(review['business_id'])
    if businessID not in restaurantMap:
      continue
    if review_userID not in UsersToBizMap:
      UsersToBizMap[review_userID] = set()
    UsersToBizMap[review_userID].add(businessID)
  return UsersToBizMap
Example #8
def writeEmbeddingFiles():
    if ".txt" in input_file:
        train_inputs, tokens = loadTxt(input_path_prefix + input_file)
    elif ".json" in input_file:
        train_inputs, tokens = loadJSON(input_path_prefix + input_file)

    embeddings = train(train_inputs)

    stem = input_file[:input_file.find('.')]
    np.savetxt(result_path_prefix + stem + "_embeddings_%d" % dim, embeddings)

    fn = result_path_prefix + stem + "_tokens"
    if not os.path.isfile(fn):
        with open(fn, 'w') as f:
            f.write(str(tokens))
Example #9
def writeEmbeddingFiles(dims):
	if ".txt" in input_file:
		train_inputs, tokens = loadTxt(input_path_prefix + input_file)
	elif ".json" in input_file:
		train_inputs, tokens = loadJSON(input_path_prefix + input_file)

	for hidden_size in dims:
		embeddings = trainCBOW(train_inputs, hidden_size=hidden_size)

		stem = input_file[:input_file.find('.')]
		np.savetxt(result_path_prefix + stem + "_CBOWembeddings_%d_%depochs" %(hidden_size, max_epochs), embeddings)

		fn = result_path_prefix + stem + "_tokens"
		if not os.path.isfile(fn):
			with open(fn, 'w') as f:
				f.write(str(tokens))

	return embeddings, tokens
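
A minimal invocation sketch (assuming the module-level input_file, input_path_prefix, result_path_prefix, and max_epochs globals are configured elsewhere in the project):

embeddings, tokens = writeEmbeddingFiles(dims=[50, 100, 200])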
Example #10
def write_edges_file(user_data, user_map, edgesFile, edgesWithScoreFile):
  # UsersToBizMap = createUsersToBizMap()
  for userID in user_map:
    user_map[userID]['restaurantMap'] = {}
    user_map[userID]['reviewMap'] = {}
  createMetaData(util.loadJSON('../yelp/review.json'), user_map)
  friend_list = []
  edges_f = open(edgesFile, "w")
  scores_f = open(edgesWithScoreFile, "w")

  scoreMap = {}
  count = 0

  for i in range(len(user_data)):
    friend_list.append(user_data[i]['friends'])
    # print(friend_list[i])
    for j in range(len(friend_list[i])):
      if friend_list[i][j] in user_map:
        pair = (user_data[i]['user_id'], friend_list[i][j])
        pair2 = (friend_list[i][j], user_data[i]['user_id'])
        if pair in scoreMap:
          score = scoreMap[pair]
        elif pair2 in scoreMap:
          score = scoreMap[pair2]
        else:
          score = getRatingSimScore(friend_list[i][j], user_data[i]['user_id'], user_map) # if user should be in the network
        if score > 0:
          src_tmp = user_map[user_data[i]['user_id']]  # user record dict for the edge source
          dst_tmp = user_map[friend_list[i][j]]        # user record dict for the edge destination
          src = src_tmp['node_id']  # extract node_id for src of edge
          dst = dst_tmp['node_id']  # extract node_id for dst of edge
          line = "{0} {1}\n".format(src, dst) 
          lineWithScore = "{0} {1} {2}\n".format(src, dst, score) 
          edges_f.write(line)
          scores_f.write(lineWithScore)
          count += 1
  print('num edges', count)
  edges_f.close()
  scores_f.close()
Example #11
def process():
    global state
    global ins_stack
    global com_stack
    global out_stack

    result = ""
    query = request.args.get("query").lower()
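    # State machine (as inferred from the branches below):
    #   state 0 - match the query against known command triggers
    #   state 1 - run an internal command, feeding it the raw query
    #   state 2 - replay a taught command's expanded instruction/argument stacks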

    if state == 0:
        command = None
        funcs = util.loadJSON("cmd.json")
        for f in funcs:
            if funcs[f]["trigger"] in query:
                command = funcs[f]
                break
        if command is None:
            if query == "":
                return "Hello."
            elif "reset" in query:
                util.writeJSON("cmd.json", (util.loadJSON("backup.json")))
                micro.micro_states = {}
            else:
                return "I couldn't understand."
        else:
            ins_stack.append(command["id"])
            if command["internal"]:
                state = 1
            else:
                x = 0
                t_i_s = [ins_stack.pop()]
                t_c_s = []
                while x < len(t_i_s):
                    while not internal(t_i_s[x]):
                        expansion = breakdown(t_i_s, x, funcs)
                        t_i_s = expansion[0] + t_i_s
                        t_c_s = expansion[1] + t_c_s
                    x += 1
                ins_stack = t_i_s
                com_stack = t_c_s
                state = 2

    if state == 1:
        feedback = call(ins_stack[0], query, out_stack)
        if feedback[1]:
            ins_stack.pop(0)
            state = 0
            out_stack.append(feedback[0])
        result = feedback[0]
    elif state == 2:
        feedback = call(ins_stack[0], com_stack[0], out_stack)
        while feedback[1] and len(ins_stack) > 0:
            ins_stack.pop(0)
            com_stack.pop(0)
            out_stack.append(feedback[0])
            if len(ins_stack) > 0:
                feedback = call(ins_stack[0], com_stack[0], out_stack)
        result = feedback[0]

        if len(ins_stack) == 0 and feedback[1]:
            state = 0
            
    return result
Example #12
def teach_command(command_string, out_stack):
    print("DEBUG" + command_string)
    frame = inspect.currentframe()
    state_key = str(inspect.getframeinfo(frame).function)
    DEFAULT_STATE = (None, [])
    if state_key not in micro_states:
        m1 = re.search("(.+)called (.+)", command_string)
        if m1 is None:
            micro_states[state_key] = DEFAULT_STATE
            return "What should I call this?", False
        else:
            command_name = m1.group(2)
            micro_states[state_key] = (command_name, [])
            cmds = util.loadJSON("cmd.json")
            conflict = False
            for x in cmds:
                x_trigger = cmds[x]["trigger"]
                if x_trigger in command_name or command_name in x_trigger:
                    conflict = True
            if conflict:
                del micro_states[state_key]
                return "Invalid command name.", True
            return "How do I do this?", False
    else:
        if "cancel" in command_string:
            del micro_states[state_key]
            return "Teaching cancelled", True
        current_state = micro_states[state_key]
        if current_state[0] is None:
            command_name = command_string
            cmds = util.loadJSON("cmd.json")
            conflict = False
            for x in cmds:
                x_trigger = cmds[x]["trigger"]
                if x_trigger in command_name or command_name in x_trigger:
                    conflict = True
            if conflict:
                del micro_states[state_key]
                return "Invalid command name.", True
            micro_states[state_key] = (command_name, [])
            return "How do I do this?", False
        else:
            if "finish" in command_string:
                if len(micro_states[state_key][1]) > 0:
                    seq_id = [i[0] for i in micro_states[state_key][1]]
                    seq_str = [i[1] for i in micro_states[state_key][1]]
                    util.saveFunction(micro_states[state_key][0], seq_id,
                                      seq_str)
                    del micro_states[state_key]
                    return "New command saved", True
                else:
                    del micro_states[state_key]
                    return "Incorrect number of steps", True
            else:
                command_step = command_string
                cmds = util.loadJSON("cmd.json")
                step_id = None
                step_str = command_string
                for x in cmds:
                    x_trigger = cmds[x]["trigger"]
                    if x_trigger in command_step:
                        if x_trigger == "teach":
                            continue
                        x_id = cmds[x]["id"]
                        step_id = x_id
                        break
                if step_id is None:
                    return "Sorry. I didn't get that", False
                else:
                    micro_states[state_key][1].append((step_id, step_str))
                    return "OK. What next?", False
Example #13
import cv2
import numpy as np
import util as u

# undistort the camera image using the calibration data stored in:
distortionData = u.loadJSON(
    r"C:\Users\reuli\Documents\Bachelor Assignment\Robot project\python\calibration\calibration"
)


# undistort the grayscale camera image
def undistort(gray):
    dst = cv2.undistort(gray, np.float32(distortionData["cameraMatrix"]),
                        np.float32(distortionData["distCoeffs"]), None,
                        np.float32(distortionData["newCameraMatrix"]))

    x, y, w, h = distortionData["validPixROI"]
    dst = dst[y:y + h, x:x + w]
    return dst
Example #14
import cv2
import numpy as np
import util as u

transformData = u.loadJSON(
    r"C:\Users\reuli\Documents\Bachelor Assignment\Robot project\python\calibration\transformationMatrix"
)


# apply a perspective transform to the image
def transform(gray):
    warped = cv2.warpPerspective(
        gray, np.float32(transformData["transform_matrix"]),
        (transformData["maxWidth"], transformData["maxHeight"]))
    return warped
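
A usage sketch chaining Examples #13 and #14 (the capture source below is hypothetical):

import cv2

cap = cv2.VideoCapture(0)
ok, frame = cap.read()
if ok:
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    corrected = undistort(gray)      # Example #13: undistort and crop to the valid ROI
    top_down = transform(corrected)  # Example #14: warp to the calibrated plane
cap.release()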
Example #15
from __future__ import division
from elasticsearch import Elasticsearch
import util, es_utility, math
import constants, search

QUERY = es_utility.readQueryFile("query_desc.51-100.short.txt")
qno = [85, 59, 56, 71, 64, 62, 93, 99, 58, 77, 54, 87, 94, 100, 89, 61, 95, 68, 57, 97, 98, 60, 80, 63, 91]

QUERY_MAP = zip([str(q) for q in qno], QUERY)

expanded_terms = util.loadJSON('prf_data.json')


def saveResults(score_function):
    filename = score_function + "_prf_results.txt"
    with open(filename, 'wb') as f:
        for n, q in QUERY_MAP:
            stemmed_terms = es_utility.getRootQuery(" ".join(q))
            new_terms = [t for t in expanded_terms[n] if t not in stemmed_terms]
            root = stemmed_terms + new_terms
            if score_function == "okapi":
                body = search.OKAPI_BODY(root)
            elif score_function == "tfidf":
                body = search.TFIDF_BODY(root)
            elif score_function == "bm25":
                body = search.BM25_BODY(root)
            elif score_function == "lm":
                body = search.LM_BODY(root)
            elif score_function == 'jm':
                body = search.JM_BODY(root)
            else:
                continue  # unknown scoring function; the original snippet is truncated here
Example #16
if __name__ == '__main__':

    def doRegression():
        param_list = train(data_train, labels_train)

        best_epoch, best_loss = test_bunch(data_test, labels_test, param_list)
        print "Best epoch:", best_epoch
        best_params = param_list[best_epoch]

        test(data_dev, labels_dev, best_params)
        test(data_train, labels_train, best_params, fn="train_preds")

    def doClassification():
        param_list = trainBuckets(data_train, labels_train)
        testBuckets(data_test, labels_test, param_list)

    # See util.py for explanation of loading process.
    # [0.6, 0.8] means 0.6 train, 0.2 dev, 0.2 test.
    all_data = list(loadJSON(input_file, [0.6, 0.8]))

    # Optional; spreads data out to approximate a uniform distribution.
    # mus/sigmas are needed if wanting to renormalize (see fn above).
    mus, sigmas = undoLabelNormalization(all_data)

    data_train, labels_train, data_dev, labels_dev, data_test, labels_test = all_data
    print "Train size:", data_train.shape[0]
    print "Test size:", data_test.shape[0]

    doRegression()
    # doClassification()
Example #17
import util, es_utility, glob
from elasticsearch import Elasticsearch

## For reading the documents
ADVANCED_PRE_PROCESSING = False
STREAM = False

## global values
JSON_FILE = "AP_DATA_FULL.json"
GLOBAL_JSON_FILE = "crawler.json"
ES_HOST = {"host": "localhost", "port": 9210}
##INDEX_NAME = 'ap_streamed_none'
INDEX_NAME = 'backup_vs'
TYPE_NAME = 'document'
ANALYZER_NAME = 'my_english'
##QREL_FILENAME = 'qrels.adhoc.51-100.AP89.txt'
QREL_FILENAME = 'QREL.txt'

STOP_LIST = util.getStopList("stoplist.txt")
DOC_LIST = util.getDocList('doclist.txt')
INDEX_CONSTANTS = util.loadJSON(GLOBAL_JSON_FILE)
CORPUS = glob.glob("ap89_collection/ap*")
RESULT_SIZE = 10

ES_CLIENT = Elasticsearch(hosts=[ES_HOST], timeout=180)
##print ES_CLIENT.count(index=INDEX_NAME)['count']