Exemplo n.º 1
0
def main():

  C = color.bcolors()

  print C.HEADER + "=========== Instantiate MapReduceFramework ===========" + C.ENDC  
  mrf = framework.MapReduceFramework()
  mrf.getWorkerInfo('prework_workers.json')
  
  #print C.HEADER + "=========== Start Local Indexing ===========" + C.ENDC
  # localIndexer.ReviewIndexing()

  #mrf.mapReduceFS('tintest', 'mapreduce/test/fish_jobs', 'mapreduce.test.wordcount.mapper', 1, 'mapreduce.test.wordcount.reducer', 'mapreduce/test/fish_jobs/out')
  #tornado.ioloop.IOLoop.instance().start()

  print C.HEADER + "=========== Start Indexing Movies ===========" + C.ENDC
  print C.OKBLUE + "Start idfBuilder_test" + C.ENDC
  mrf.mapReduceFS('idfBuilder_test', 'mapreduce/input_movie_test', 'src.idfBuilder.mapper', 1, 'src.idfBuilder.reducer', 'constants/idf')
  tornado.ioloop.IOLoop.instance().start()
  jobTable = DisTable(tableName='idfBuilder_test')
  print type(jobTable)
  print jobTable.fetch_all()
  print 'works!!'

  print C.OKBLUE + "Start idfBuilder" + C.ENDC
  mrf.mapReduceFS('idfBuilder', 'constants/input_movie_test', 'src.idfBuilder.mapper', 1, 'src.idfBuilder.reducer', 'constants/idf')
  tornado.ioloop.IOLoop.instance().start()
  print C.OKBLUE + "Start invertedIndexer" + C.ENDC
  mrf.mapReduceFS('invertedIndexer', 'constants/input_movie', 'src.invertedIndexer.mapper', 3, 'src.invertedIndexer.reducer', 'constants/invertedIndex')  
  tornado.ioloop.IOLoop.instance().start()
  print C.OKBLUE + "Start documentStore" + C.ENDC
  mrf.mapReduceFS('documentStore', 'constants/input_movie', 'src.documentStore.mapper', 3, 'src.documentStore.reducer', 'constants/documentStore')
  tornado.ioloop.IOLoop.instance().start()

  print C.HEADER + "=========== Start Indexing Genre ===========" + C.ENDC
  print C.OKBLUE + "Start genreIndexer" + C.ENDC
  mrf.mapReduceFS('genreIndexer', 'constants/input_movie', 'src.genreIndexer.mapper', 1, 'src.genreIndexer.reducer', 'constants/genreIndexer')
  tornado.ioloop.IOLoop.instance().start()
  
  print C.HEADER + "=========== Start Indexing Reviews ===========" + C.ENDC
  print C.OKBLUE + "Start movieIndexer" + C.ENDC
  mrf.mapReduceFS('movieIndexer', 'constants/input_review', 'src.movieIndexer.mapper', 3, 'src.movieIndexer.reducer', 'constants/movieIndexer')
  tornado.ioloop.IOLoop.instance().start()
  print C.OKBLUE + "Start reviewIndexer" + C.ENDC
  mrf.mapReduceFS('movieIndexer', 'constants/input_review', 'src.reviewIndexer.mapper', 3, 'src.reviewIndexer.reducer', 'constants/reviewIndexer')
  tornado.ioloop.IOLoop.instance().start()

  print C.HEADER + "=========== Start Classification Training ===========" + C.ENDC
  worker_address = 'prework_workers.json'
  #raw_data = 'constants/Genre_dict'
  raw_data = 'constants/Genre_dictII_9500'
  training_set = 'constants/training_set.p'
  weights_dir = 'constants/classification_weights'
  tn = trainer.Trainer()
  tn.setWorkerInfo(worker_address)
  genres = tn.processRawData(raw_data, training_set)
  tn.setTraningParameter(0.9, 500, 0.01)
  tn.train(training_set, genres, weights_dir)
  tornado.ioloop.IOLoop.instance().start()
  tn.generateWeightTable(weights_dir)
Exemplo n.º 2
0
def main():

    C = color.bcolors()

    print C.HEADER + "=========== Instantiate MapReduceFramework ===========" + C.ENDC
    mrf = framework.MapReduceFramework()
    mrf.getWorkerInfo('mapreduce_workers.json')

    #print C.HEADER + "=========== Start Local Indexing ===========" + C.ENDC
    # localIndexer.ReviewIndexing()

    print C.HEADER + "=========== Start Indexing Movies ===========" + C.ENDC
    print C.OKBLUE + "Start invertedIndexer" + C.ENDC
    mrf.mapReduce('constants/input_movie', 'src.invertedIndexer.mapper', 3,
                  'src.invertedIndexer.reducer', 'constants/invertedIndex')
    tornado.ioloop.IOLoop.instance().start()
    print C.OKBLUE + "Start idfBuilder" + C.ENDC
    mrf.mapReduce('constants/input_movie', 'src.idfBuilder.mapper', 1,
                  'src.idfBuilder.reducer', 'constants/idf')
    tornado.ioloop.IOLoop.instance().start()
    print C.OKBLUE + "Start documentStore" + C.ENDC
    mrf.mapReduce('constants/input_movie', 'src.documentStore.mapper', 3,
                  'src.documentStore.reducer', 'constants/documentStore')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Indexing Genre ===========" + C.ENDC
    print C.OKBLUE + "Start genreIndexer" + C.ENDC
    mrf.mapReduce('constants/input_movie', 'src.genreIndexer.mapper', 1,
                  'src.genreIndexer.reducer', 'constants/genreIndexer')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Indexing Reviews ===========" + C.ENDC
    print C.OKBLUE + "Start movieIndexer" + C.ENDC
    mrf.mapReduce('constants/input_review', 'src.movieIndexer.mapper', 3,
                  'src.movieIndexer.reducer', 'constants/movieIndexer')
    tornado.ioloop.IOLoop.instance().start()
    print C.OKBLUE + "Start reviewIndexer" + C.ENDC
    mrf.mapReduce('constants/input_review', 'src.reviewIndexer.mapper', 3,
                  'src.reviewIndexer.reducer', 'constants/reviewIndexer')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Classification Training ===========" + C.ENDC
    worker_address = 'classification_workers.json'
    raw_data = 'constants/Genre_dict'
    raw_data = 'constants/Genre_dictII_9500'
    training_set = 'constants/training_set.p'
    weights_dir = 'constants/classification_weights'
    tn = trainer.Trainer()
    tn.setWorkerInfo(worker_address)
    genres = tn.processRawData(raw_data, training_set)
    tn.setTraningParameter(0.9, 500, 0.01)
    tn.train(training_set, genres, weights_dir)
    tornado.ioloop.IOLoop.instance().start()
    tn.generateWeightTable(weights_dir)
Exemplo n.º 3
0
def main():

  C = color.bcolors()

  print C.HEADER + "=========== Instantiate MapReduceFramework ===========" + C.ENDC  
  mrf = framework.MapReduceFramework()
  mrf.getWorkerInfo('mapreduce_workers.json')
  
  #print C.HEADER + "=========== Start Local Indexing ===========" + C.ENDC
  # localIndexer.ReviewIndexing()
  
  print C.HEADER + "=========== Start Indexing Movies ===========" + C.ENDC
  print C.OKBLUE + "Start invertedIndexer" + C.ENDC
  mrf.mapReduce('constants/input_movie', 'src.invertedIndexer.mapper', 3, 'src.invertedIndexer.reducer', 'constants/invertedIndex')  
  tornado.ioloop.IOLoop.instance().start()
  print C.OKBLUE + "Start idfBuilder" + C.ENDC
  mrf.mapReduce('constants/input_movie', 'src.idfBuilder.mapper', 1, 'src.idfBuilder.reducer', 'constants/idf')
  tornado.ioloop.IOLoop.instance().start()
  print C.OKBLUE + "Start documentStore" + C.ENDC
  mrf.mapReduce('constants/input_movie', 'src.documentStore.mapper', 3, 'src.documentStore.reducer', 'constants/documentStore')
  tornado.ioloop.IOLoop.instance().start()

  print C.HEADER + "=========== Start Indexing Genre ===========" + C.ENDC
  print C.OKBLUE + "Start genreIndexer" + C.ENDC
  mrf.mapReduce('constants/input_movie', 'src.genreIndexer.mapper', 1, 'src.genreIndexer.reducer', 'constants/genreIndexer')
  tornado.ioloop.IOLoop.instance().start()
  
  print C.HEADER + "=========== Start Indexing Reviews ===========" + C.ENDC
  print C.OKBLUE + "Start movieIndexer" + C.ENDC
  mrf.mapReduce('constants/input_review', 'src.movieIndexer.mapper', 3, 'src.movieIndexer.reducer', 'constants/movieIndexer')
  tornado.ioloop.IOLoop.instance().start()
  print C.OKBLUE + "Start reviewIndexer" + C.ENDC
  mrf.mapReduce('constants/input_review', 'src.reviewIndexer.mapper', 3, 'src.reviewIndexer.reducer', 'constants/reviewIndexer')
  tornado.ioloop.IOLoop.instance().start()

  print C.HEADER + "=========== Start Classification Training ===========" + C.ENDC
  worker_address = 'classification_workers.json'
  raw_data = 'constants/Genre_dict'
  raw_data = 'constants/Genre_dictII_9500'
  training_set = 'constants/training_set.p'
  weights_dir = 'constants/classification_weights'
  tn = trainer.Trainer()
  tn.setWorkerInfo(worker_address)
  genres = tn.processRawData(raw_data, training_set)
  tn.setTraningParameter(0.9, 500, 0.01)
  tn.train(training_set, genres, weights_dir)
  tornado.ioloop.IOLoop.instance().start()
  tn.generateWeightTable(weights_dir)
Exemplo n.º 4
0
import urllib
from nltk.tokenize import RegexpTokenizer

from tornado.httpclient import AsyncHTTPClient
from tornado import gen
from tornado.options import define, options

from operator import mul

# Module-level caches; None until initialised -- presumably populated at
# server start-up elsewhere in this module (TODO confirm where assigned).
invertedIndex = None
tokenizer = None
IDF_Index = None


# Shared ANSI colour helper used for console output.
from src import color
bcolors = color.bcolors()

def Tosnippet(text, keywords, extend):
    """Build an HTML snippet by wrapping keyword occurrences in <strong> tags.

    NOTE(review): this definition appears truncated in this chunk -- the
    trailing while-loop has no body beyond re-running find(), and no return
    statement is visible.  Documented as-is; do not assume it is complete.
    """
    returnText= '...'

    for keyword in keywords:
        loc= 0
        tmp= 0
        # Locate the first case-insensitive occurrence, then bold the
        # original-cased text found at that position (global replace).
        # NOTE(review): if the keyword is absent, loc is -1 and the slice
        # below grabs text near the end of the string -- verify intent.
        loc = text.lower().find(keyword.lower(), loc)
        toReplace = text[int(loc):int(loc)+int(len(keyword))]
        text= text.replace(toReplace, '<strong>{}</strong>'.format(toReplace)) + "..."
        loc= 0
        tmp= 0
        # NOTE(review): find() is re-run with an unchanged start offset, so
        # loc never advances past a match -- looks like an infinite loop
        # unless the truncated (missing) body advanced loc.  Verify upstream.
        while (loc<len(text) and loc!=-1):
            tmp = loc
            loc = text.lower().find(keyword.lower(), loc)
Exemplo n.º 5
0
def main():
  """Fork one process per server role and run the recommendation, search,
  and classification HTTP servers under tornado.

  Relies on names defined elsewhere in this file (getPorts, fork_processes,
  bind_sockets, SuperFront, ClassifierServer, NumMaster, NumMovie,
  NumReview, NumIdx, NumDoc, tornado) -- not visible in this chunk.
  """
  from recommendation import recom_worker, recom_front
  from searchEngine.backend import back as searchEng_worker
  from searchEngine.frontend import front as searchEng_front
  from classification.backend import online as classifier
  # from recommendation import searchEng_worker, searchEng_front
  import mapreduce.framework as framework
  from src import color
  # from src import tomatoCrawler as TC
  C = color.bcolors()

  global masterServer, MovieServer, ReviewServer, IdxServer, DocServer, Baseport

  print C.HEADER + "=========== Start Crawling ===========" + C.ENDC
  # TC.main2Genre()

  print C.HEADER + "=========== Find Available Ports ===========" + C.ENDC
  # Populates the *Server address lists printed and bound below.
  getPorts()

  print C.OKBLUE + "SuperFront:\t" + str(SuperFront) + C.ENDC
  print C.OKBLUE + "masterServer:\t" + str(masterServer) + C.ENDC
  print C.OKBLUE + "MovieServer:\t" + str(MovieServer) + C.ENDC
  print C.OKBLUE + "ReviewServer:\t" + str(ReviewServer) + C.ENDC
  print C.OKBLUE + "IdxServer:\t" + str(IdxServer) + C.ENDC
  print C.OKBLUE + "DocServer:\t" + str(DocServer) + C.ENDC
  print C.OKBLUE + "ClassifierServer:\t" + str(ClassifierServer) + C.ENDC


  print C.HEADER + "=========== Fire Up All Servers ===========" + C.ENDC
  # Each forked child receives a distinct uid (0..N-1) and takes one branch.
  uid = fork_processes(NumMaster+NumMovie+NumReview+NumIdx+NumDoc)

  if uid == 0:
    # Recommendation front-end.
    sockets = bind_sockets(masterServer[uid].split(':')[-1])
    myfront = recom_front.FrontEndApp(MovieServer, ReviewServer)
    server  = myfront.app
  elif uid ==1:
    # Search-engine front-end.
    sockets = bind_sockets(masterServer[uid].split(':')[-1])
    myfront = searchEng_front.FrontEndApp(IdxServer, DocServer)
    server  = myfront.app
  elif uid ==2:
    # Classification prediction server loading pre-trained (pickled) weights.
    sockets = bind_sockets(masterServer[uid].split(':')[-1])
    myClasify = classifier.Application(([(r"/predict?", classifier.PredictionHandler)]))
    myClasify.setGenres("./constants/classification_weights/genres.p")
    myClasify.setWeights("./constants/classification_weights/big_weight.p")
    server  = tornado.httpserver.HTTPServer(myClasify )

  elif uid < NumMaster + NumMovie:
    # Recommendation movie back-end workers.
    myIdx = uid - NumMaster
    sockets = bind_sockets(MovieServer[myIdx].split(':')[-1])
    myback_movie = recom_worker.RecommApp('MovieServer', myIdx, MovieServer[myIdx].split(':')[-1])
    server  = myback_movie.app
  elif uid < NumMaster + NumMovie + NumReview:
    # Recommendation review back-end workers.
    myIdx = uid - NumMovie - NumMaster
    sockets = bind_sockets(ReviewServer[myIdx].split(':')[-1])
    myback_review = recom_worker.RecommApp('ReviewServer', myIdx, ReviewServer[myIdx].split(':')[-1])
    server  = myback_review.app
  elif uid < NumMaster + NumMovie + NumReview + NumIdx:
      # Search-engine index workers (original deeper indent kept).
      myIdx = uid-NumMovie-NumReview-NumMaster
      sockets = bind_sockets(IdxServer[myIdx].split(':')[-1])
      myback_idx = searchEng_worker.BackEndApp('IndexServer', myIdx, IdxServer[myIdx].split(':')[-1])
      server  = myback_idx.app
  elif uid < NumMaster + NumMovie + NumReview + NumIdx + NumDoc:
      # Search-engine document-store workers.
      myIdx = uid-NumMovie-NumReview-NumIdx-NumMaster
      sockets = bind_sockets(DocServer[myIdx].split(':')[-1])
      myback_doc = searchEng_worker.BackEndApp('DocServer', myIdx, DocServer[myIdx].split(':')[-1])
      server  = myback_doc.app


  # NOTE(review): if 2 < uid < NumMaster no branch above assigns 'server' or
  # 'sockets' -- presumably NumMaster == 3; confirm where NumMaster is set.
  server.add_sockets(sockets)
  tornado.ioloop.IOLoop.instance().start()
Exemplo n.º 6
0
import tornado.httpserver
import tornado.ioloop
import tornado.web
import hashlib
import socket
import getpass

import json, pickle

from tornado.httpclient import AsyncHTTPClient
from tornado import gen
from tornado.options import define, options

from src import color
bcolors = color.bcolors()

# ## collecting four available ports
# Module-level state shared by the code below; filled in at start-up
# (presumably by a port-discovery routine elsewhere -- TODO confirm).
ports = []
ports_index = []
ports_Doc = []
genere_dict = {}


def remove_duplicates(mylist):
    output = []
    seen = set()
    for (movieID, value) in mylist:
        # If value has not been encountered yet,
        # ... add it to both list and set.
        if movieID not in seen:
            output.append((movieID, value))