Example #1
def trans(src, dst, index, label_index, mode='w', sep='/'):
    """Convert a tagged corpus into the binary graph format.

    Each input line holds space-separated word/tag pairs; `sep` separates
    a word from its tag. Features and labels are mapped to integer ids by
    the two Indexer objects.
    """
    lid = indexer.Indexer(label_index, mode)
    inder = indexer.Indexer(index, mode)

    file = open(dst, 'wb')
    ln = 0
    for line in open(src, encoding='utf8'):
        ln += 1
        wts = [x.rpartition(sep) for x in line.strip().split(' ')]
        if sep == ' ':
            tags = ['' for x in wts]
            line = [x[-1] for x in wts]
        else:
            tags = [x[-1] for x in wts]
            line = [x[0] for x in wts]
        seq = ''.join(line)

        graph = []
        # Feature ids per character position; unknown features map to -1
        # and are dropped (materialized as lists, as in Example #7).
        fs = [
            list(filter(lambda x: x >= 0, [inder(k) for k in gen_keys(seq, x)]))
            for x in range(len(seq))
        ]
        for c, v in zip(_to_tags(tags, line, lid), fs):
            graph.append([0, [], c, v])
        if not graph: continue
        # Flag the first node as a start node and the last as an end node.
        graph[0][0] += 1
        graph[-1][0] += 2
        # Chain the nodes left to right.
        for i in range(1, len(graph)):
            graph[i][1] = [i - 1]

        json_to_binary.graph_to_file(graph, file)

        if ln % 1000 == 0:
            print(ln)

    file.close()
    print(len(inder))

    print('the end')
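A minimal invocation sketch for the function above (the file names here are hypothetical; gen_keys, _to_tags and json_to_binary come from the surrounding module):

    # Build feature/label indexes in write mode while converting a
    # "word/TAG word/TAG ..." corpus into binary graph records.
    trans('train.txt', 'train.bin', 'feature.idx', 'label.idx', mode='w', sep='/')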
Example #2
def main():
    indexing = indexer.Indexer('database')
    file_name, query, function = entering()
    indexing.indexing_with_lines(file_name)
    searching = SearchEngine('database')

    result = None
    if function == 1:
        result = searching.multiple_tokens_search(query)
    elif function == 2:
        context_size = int(input('Enter the context size to search for:\n'))
        result = searching.search_to_sentence(query, context_size)
    elif function == 3:
        context_size = int(input('Enter the context size to search for:\n'))
        result = searching.search_to_highlight(query, context_size)
    elif function == 4:
        exit()
    else:
        print('You entered a wrong function number, try again')
        return main()  # restart; the original re-prompted but fell through with result=None

    with open('result.txt', 'w') as result_file:
        for key in result:
            result_file.write(key + ':\n')
            for el in result[key]:
                result_file.write(str(el) + '\n')
            result_file.write('\n')

    del searching
    for filename in os.listdir(os.getcwd()):
        if filename == 'database' or filename.startswith('database.'):
            os.remove(filename)
Example #3
def main():
    indexing = indexer.Indexer("database")

    testfile = open("text.txt", 'w')
    testfile.write("There are only fluffy kittens!")
    testfile.close()
    testfile2 = open("text2.txt", 'w')
    testfile2.write("only kittens and puppies...")
    testfile2.close()
    indexing.index_with_lines("text2.txt")
    indexing.index_with_lines("text.txt")
    testsearch = SearchEngine('database')
    # context '3,3'
    windowsdict = testsearch.several_tokens_search_with_customizable_context_acc("only kittens", 3, 3, 3, -1)
    print(windowsdict)
Example #4
def index():
    idx = indexer.Indexer()

    for review in loader.LoadDataSeq():
        toneScore = analyzer.GetToneAnalysis(review["reviews.text"])
        review["toneScore"] = toneScore
        # `_id` was undefined in the original; assuming the review carries its own id.
        idx.Index(review.get("id"), review)
Example #5
def main():
    index = indexer.Indexer('db')
    d = open('tgt.txt', 'w')
    d.write('this is a test required for helping. students create a test\n')
    d.write(' professor required to write a test first')
    d.close()
    index.indexing_with_lines('tgt.txt')
    t = open('ttt.txt', 'w')
    t.write('test is required. On the other hand...')
    t.close()
    index.indexing_with_lines('ttt.txt')
    del index
    engine = SearchEngine('db')
    search = engine.search_multiple('test')

    today = engine.limit_quote_context_search('test', -2, 0, [(2, 0), (1, 0)])
    re = engine.search_extended_context('test', 2)
    print(today)
    del engine
    if 'tgt.txt' in os.listdir(os.getcwd()):
        os.remove('tgt.txt')
    if 'ttt.txt' in os.listdir(os.getcwd()):
        os.remove('ttt.txt')
    for filename in os.listdir(os.getcwd()):
        if filename == 'db' or filename.startswith('db.'):
            os.remove(filename)
Example #6
    def run(self):
        self.extract_queries()
        idx = Indexer.Indexer()
        idx.run()

        print "Running Queries on index:"
        for num, query in zip(self.queryNums, self.queryTexts):
            idx.query(num, query)
Example #7
def trans(src, dst, index, label_index, mode='w', sep='/', dictionary=None):
    lid = indexer.Indexer(label_index, mode)
    inder = indexer.Indexer(index, mode)
    if dictionary: dict_feature = DictFeature(dictionary)

    file = open(dst, 'wb')
    ln = 0
    for line in open(src, encoding='utf8'):
        ln += 1
        wts = [x.rpartition(sep) for x in line.strip().split(' ')]
        if sep == ' ':
            tags = ['' for x in wts]
            line = [x[-1] for x in wts]
        else:
            tags = [x[-1] for x in wts]
            line = [x[0] for x in wts]
        seq = ''.join(line)
        graph = []

        fs = [[inder(k) for k in gen_keys(seq, x)] for x in range(len(seq))]
        if dictionary: dict_feature(seq, inder, fs)
        fs = [list(filter(lambda x: x >= 0, fv)) for fv in fs]

        for c, v in zip(_to_tags(tags, line, lid), fs):
            graph.append([0, [], c, v])
        if not graph: continue
        graph[0][0] += 1
        graph[-1][0] += 2
        for i in range(1, len(graph)):
            graph[i][1] = [i - 1]

        json_to_binary.graph_to_file(graph, file)

        if ln % 1000 == 0:
            print(ln)

    file.close()
    print(len(inder))

    print('the end')
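This variant differs from Example #1 only in the optional dictionary: DictFeature injects dictionary-match features into fs before the unknown (-1) ids are filtered out. A hypothetical call (file names assumed):

    trans('train.txt', 'train.bin', 'feature.idx', 'label.idx', sep='/',
          dictionary='lexicon.txt')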
Example #8
    def setUp(self):
        index = indexer.Indexer('dbase')
        f = open('test.txt', 'w')
        f.write('this is\ntest')
        f.close()
        t = open('tst.txt', 'w')
        t.write('test')
        t.close()
        index.indexing_with_lines('test.txt')
        index.indexing_with_lines('tst.txt')
        del index
        self.s = SearchEngine('dbase')
Example #9
    def setUp(self):
        index = indexer.Indexer('dbase')
        f = open('test.txt', 'w')
        f.write('this is a test required for helping students create a test\n')
        f.write(' professor required to write a test first')
        f.close()
        t = open('tst.txt', 'w')
        t.write('test is required. On the other hand...')
        t.close()
        index.indexing_with_lines('test.txt')
        index.indexing_with_lines('tst.txt')
        del index
        self.s = SearchEngine('dbase')
Example #10
def build_index():
    corpus_path = util.get_corpus_dir_path_from_args()
    preprocessor = preprocessing.Preprocessor(corpus_path)
    doc_to_terms: list[preprocessing.DocToTerms] = preprocessor.parse()

    indexer_ob = indexer.Indexer(doc_to_terms)
    inverted_index: dict[str, indexer.Posting] = indexer_ob.inverter_index()
    doc_id_name_index: dict[int, str] = indexer_ob.doc_id_to_doc_name_index()

    tf_idf_ranker = ranker.Ranker(inverted_index, doc_id_name_index)
    _tfidf = tf_idf_ranker.tfidf()

    print('Indexing completed, saving...')
    util.save_obj(doc_id_name_index, DOC_ID_NAME_INDEX_NAME)
    util.save_obj(inverted_index, INVERTED_INDEX_FILE_NAME)
    util.save_pandas_df_as_pickle(_tfidf, TFIDF_NAME_INDEX_NAME)
    print('Saved index for quick results for future queries')
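The saved artifacts imply a query-time counterpart; a minimal sketch, assuming util.load_obj mirrors util.save_obj and that indexer.Posting exposes a doc_id attribute (neither is shown in this example):

    # Hypothetical query-time lookup against the pickled indexes.
    inverted_index = util.load_obj(INVERTED_INDEX_FILE_NAME)
    doc_id_name_index = util.load_obj(DOC_ID_NAME_INDEX_NAME)
    posting = inverted_index.get('sample-term')
    if posting is not None:
        print(doc_id_name_index[posting.doc_id])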
Example #11
def main():
    indexing = indexer.Indexer('database')

    indexing.indexing_with_lines('text.txt')
    searching = SearchEngine('database')
    result = searching.search_to_sentence('туманы')
    print(result)
    del searching
Example #12
def test(index, src, dst):
    inder = indexer.Indexer(index, 'r')
    file = open(dst, 'wb')
    for line in open(src, encoding='utf8'):
        line = line.split()
        seq = ''.join(line)
        graph = []
        # Feature ids per character position; unknown (-1) ids are dropped.
        fs = [list(filter(lambda x: x >= 0, [inder(k) for k in gen_keys(seq, x)]))
              for x in range(len(seq))]
        for c, v in zip(_to_tags(line), fs):
            graph.append([0, [], c, v])
        if not graph: continue
        graph[0][0] += 1
        graph[-1][0] += 2
        for i in range(1, len(graph)):
            graph[i][1] = [i - 1]
        json_to_binary.graph_to_file(graph, file)
    print('the end')
    file.close()
Example #13
    def test_scw_single(self):
        """test if the program is working correctly
        when searching for context windows extended to
        the sentence boundaries for a single word
        """
        k = open('newtest.txt', 'w')
        k.write('What is your name? My name is test.')
        k.close()
        ind = indexer.Indexer('newdb')
        ind.indexing_with_lines('newtest.txt')
        del ind
        self.k = SearchEngine('newdb')
        result = self.k.search_extended_context('test', 1)
        output = {'newtest.txt': [Context_Window([Position_with_lines(30, 34, 0)], 19, 35, 'What is your name? My name is test.')]}
        self.assertEqual(result, output)
        del self.k
        for filename in os.listdir(os.getcwd()):
            if filename == 'newdb' or filename.startswith('newdb.'):
                os.remove(filename)
        os.remove('newtest.txt')
Example #14
    def __init__(self,
                 ecran,
                 adventure: Adventure,
                 s: socket.socket = None,
                 p: tuple = ('127.0.0.1', 5500)):
        self.__start_at__ = 0

        self.adventure = adventure

        self.fps_regulator = ree.create_clock()
        self.continuer = 1
        self.ecran = ecran
        self.sock = s
        self.params = p
        self.renderer_manager = renderer_manager.RendererManager()
        self.show_fps = False

        # Fonts
        self.police_normale = ree.load_font(POLICE_PATH, POL_NORMAL_TAILLE)
        self.police_grande = ree.load_font(POLICE_PATH, POL_GRANDE_TAILLE)
        self.police_petite = ree.load_font(POLICE_PATH, POL_PETITE_TAILLE)

        # Managers
        self.carte_mgr = carte.CartesManager(self.ecran, self.renderer_manager,
                                             self.police_normale)
        self.oth_persos_mgr = personnage.OthPersonnagesManager(
            self.ecran, self.carte_mgr)
        self.indexeur = indexer.Indexer(self.ecran, self.police_grande,
                                        self.renderer_manager)
        self.equipe_mgr = equipe_manager.EquipeManager(self.ecran,
                                                       self.police_grande,
                                                       self.indexeur,
                                                       self.renderer_manager)
        self.pc_mgr = computer_manager.ComputerManager(self.ecran,
                                                       self.police_grande,
                                                       self.renderer_manager)
        self.tab_types = tab_types.Storage()
        self.cur_combat = None
        self.menu_in_game = menu_in_game.Menu(self.ecran, self.police_grande)
        self.zones_manager = zones_attaques_manager.ZonesManager(self.indexeur)
        self.money = money_mgr.MoneyManager()
        self.gui_save_mgr = GUISauvegarde(self.ecran, self.police_grande)
        self.network_ev_listener = NetworkEventsListener(
            self.sock, self.params)
        self.chat_mgr = chat_manager.ChatManager(self.ecran,
                                                 self.police_normale,
                                                 self.network_ev_listener,
                                                 self.adventure.get_pseudo(),
                                                 RANG_NUL)
        self.mini_map = carte.CarteRenderer(self.ecran, self.police_normale,
                                            self.adventure)
        self.attaques_table = atk_sys.AttaquesTable()
        self.parametres = ParametresManager()
        self.musics_player = music_player.MusicPlayer()

        # Entities
        self.personnage = personnage.Personnage(self.carte_mgr, self.ecran,
                                                self.police_grande)

        # Controls
        self.controles = {}
        self.controles_joy = {}
        self.joystick = None

        self.__ctrls = {}
        self._default_dt = 1.0
        self._play_music = True
        self._play_anims = True
Example #15
class Employer:
    id_counter = indexer.Indexer()

    # Initial setup
    def __init__(self):
        self.id = Employer.id_counter.gen()
        self.employees = []
        self.assignment_work_history = []
        #self.processed_work = 0.0
        self.time = 0
        self.examql = None
        self.renewql = None
        self.changeql = None

    # Employment-type change
    def change_employment(self, employee):
        state = self.changeql.state(self,employee)    
        return self.changeql.action(state, employee)

    # Hiring exam
    def exam(self, employee):
        state = self.examql.state(self,employee)    
        return self.examql.action(state, employee)

    # Contract renewal
    def renew(self, employee):
        state = self.renewql.state(self,employee)
        return self.renewql.action(state, employee)
    
    # Employment-type change (accepted)
    def regular_employ(self, employee):
        if employee in self.employees:
            pstate = self.changeql.state(self,employee)
            action = True
            employee.change_regular()
            nstate = self.changeql.state(self,employee)
            reward = self.changeql.reward(self)
            self.changeql.update(pstate, nstate, action, reward)

    # Employment-type change (rejected)
    def temporary_employ(self, employee):
        if employee in self.employees:
            # The original took pstate from examql; changeql drives the rest
            # of this transition, so it is used here as well.
            pstate = self.changeql.state(self, employee)
            action = False
            nstate = self.changeql.state(self, employee)
            reward = self.changeql.reward(self)
            self.changeql.update(pstate, nstate, action, reward)

    # Hiring exam (passed)
    def employ(self, employee, gdp_index):
        if employee not in self.employees:
            pstate = self.examql.state(self,employee)
            action = True
            employee.employed(self,gdp_index)
            self.employees.append(employee)
            nstate = self.examql.state(self,employee)
            reward = self.examql.reward(self)
            self.examql.update(pstate, nstate, action, reward)

    # Hiring exam (failed)
    def reject(self, employee):
        if employee not in self.employees:
            pstate = self.examql.state(self,employee)
            action = False
            nstate = self.examql.state(self,employee)
            reward = self.examql.reward(self)
            self.examql.update(pstate, nstate, action, reward)

    # Contract renewal (kept on)
    def keep(self, employee):
        if employee in self.employees:
            pstate = self.renewql.state(self, employee)
            action = True
            nstate = self.renewql.state(self, employee)
            reward = self.renewql.reward(self)
            self.renewql.update(pstate, nstate, action, reward)        

    # Contract renewal (let go)
    def fire(self, employee):
        if employee in self.employees:
            pstate = self.renewql.state(self, employee)
            action = False
            employee.fired(self)
            self.employees.remove(employee)
            nstate = self.renewql.state(self, employee)
            reward = self.renewql.reward(self)
            self.renewql.update(pstate, nstate, action, reward)
                  
    # A worker agent resigns
    def resign(self, employee):
        if employee in self.employees:
            self.employees.remove(employee)

    # Count employees of a given employment type
    def count_worker_type(self, work_type):
        return len([e for e in self.employees if e.work_type == work_type])

    # Count regular workers
    def count_regular(self):
        return self.count_worker_type(WORKER_TYPE.REGULAR)

    # Count temporary workers
    def count_temporary(self):
        return self.count_worker_type(WORKER_TYPE.TEMPORARY)

    # Count worker agents
    def count_employee(self):
        return len(self.employees)

    # Assign work allotted from GDP
    def set_work(self, work):
        self.assignment_work_history.append(work)

    def assigned_work(self, index=-1):
        # Return the work assigned at `index` (default: most recent),
        # or 0 if the history does not reach that far.
        if index < 0:
            if abs(index) > len(self.assignment_work_history):
                return 0
        else:
            if index + 1 > len(self.assignment_work_history):
                return 0
        return self.assignment_work_history[index]

    # Amount of work performed
    def processed_work(self):
        return sum([e.work() for e in self.employees])

    # Amount of work remaining
    def remained_work(self):
        return self.assigned_work() - self.processed_work()
    
    # Advance time
    def elapse(self, time_interval=1):
        self.time += time_interval

    # Reset
    def clear(self):
        self.employees.clear()
        self.assignment_work_history.clear()
        self.time = 0

    def to_s(self):
        l = [e.to_s() for e in self.employees]
        employee_str = ', \n'.join(l)

        return '<EMPLOYER ' + \
            'ID:' + str(self.id) + \
            ', ' + \
            'EMPLOYEES = \n' + employee_str + \
            ', T_TASK = ' + str(self.assigned_work()) + \
            ', P_TASK = ' + str(self.processed_work()) + \
            '\n>'
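Employer delegates every decision to three pluggable learner objects (examql, renewql, changeql) sharing a state/action/reward/update interface. A minimal wiring sketch with a hypothetical stub learner:

    class StubQL:
        # Stand-in exposing the interface Employer expects.
        def state(self, employer, employee): return (employer.count_employee(),)
        def action(self, state, employee): return True
        def reward(self, employer): return 0.0
        def update(self, pstate, nstate, action, reward): pass

    boss = Employer()
    boss.examql = boss.renewql = boss.changeql = StubQL()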
Example #16
class Employee:
    id_counter = indexer.Indexer()

    ########## Initial setup ##########
    def __init__(self):
        self.id = Employee.id_counter.gen()
        self.age = WORKER_AGE.LOWER
        self.work_type = WORKER_TYPE.JOBLESS
        self.employer = None
        self.length_of_service = 0
        self.rewards = {}
        self.selection_strategy = None

########## Start-of-year processing ##########

    def begin(self):
        self.elapse()

########## Decide which employer agents to select ##########

    def select_employers(self, employers, num=1):
        return self.selection_strategy.select(self, employers, num)

########## Get hired (every worker agent starts as a temporary worker) ##########

    def employed(self, employer, gdp_index):
        if self.employer is not None and self.employer != employer:
            self.retire()
        self.employer = employer
        self.work_type = WORKER_TYPE.TEMPORARY
        self.length_of_service = 0
        self.rewards[self.employer] = 0

########## Work (labor-output calculation) ##########

    def work(self):
        # Labor output peaks at age 43 (MHLW figures); a retirement-age worker
        # produces 1.5x the output of a new graduate, and GDP per worker is 67.
        # Service length contributes linearly.
        # Quadratic approximation of labor efficiency by age:
        #   f(x) = -0.00236*x^2 + 0.22*x - 3.12325, x: working age
        # f(x) is meant to be 1 at the minimum working age.
        alpha = 0.5
        beta = 0.5
        gdp_unit = 67
        # Age efficiency
        f = lambda x: -0.00236 * x**2 + 0.22 * x - 3.12325
        # Service-length efficiency: roughly 1 at age 20, 2 at age 55
        g = lambda x: 0.029464 * x + 0.410714
        dp = alpha * f(self.age) + beta * g(self.length_of_service)
        return gdp_unit * dp

########## Receive a salary ##########

    def salary(self, money):
        if self.employer is not None:
            if self.work_type == WORKER_TYPE.REGULAR:
                self.rewards[self.employer] += money * 2
            elif self.work_type == WORKER_TYPE.TEMPORARY:
                self.rewards[self.employer] += money

########## Change employment type ##########

    def change(self, work_type):
        self.work_type = work_type

########## Convert to regular employment ##########

    def change_regular(self):
        self.work_type = WORKER_TYPE.REGULAR

########## Retire ##########

    def retire(self):
        # Only if the worker currently has an employer agent
        if self.employer is not None:
            self.employer.resign(self)
            self.employer = None
            self.work_type = WORKER_TYPE.JOBLESS
            self.length_of_service = 0

########## Be notified of dismissal ##########

    def fired(self, employer):
        self.employer = None
        self.work_type = WORKER_TYPE.JOBLESS
        self.length_of_service = 0

########## Advance time (service length and age) ##########

    def elapse(self, time_interval=1):
        self.length_of_service += time_interval
        self.age += time_interval

########## Check working age ##########

    def is_worker_age(self):
        return (WORKER_AGE.LOWER <= self.age <= WORKER_AGE.UPPER)

########## String representation ##########

    def to_s(self):
        return "<EMPLOYEE \n" + \
            "ID:" + str(self.id) + ", " + \
            "EMPLOYER = " + (str(self.employer.id) if self.employer is not None else "None") + ", " + \
            "EMP_TYPE = " + str(self.work_type) + \
            "\n>"

########## End-of-year processing ##########

    def end(self):
        # If the law comes into force, workers with more than 5 years of
        # service would be converted via self.change(WORKER_TYPE.REGULAR).
        pass
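A hypothetical interaction between the two agent classes, assuming the module's WORKER_AGE/WORKER_TYPE constants and the stub learner sketched at the end of Example #15:

    worker = Employee()
    boss.employ(worker, gdp_index=1.0)  # hired as a temporary worker
    worker.salary(10)                   # temporary workers receive the base amount
    boss.regular_employ(worker)         # promoted to regular employment
    print(boss.count_regular())         # -> 1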
Example #17
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import flask

import config
import indexer
import main
import search
import settings
import static

if __name__ == "__main__":
    indexer = indexer.Indexer(config.INDEX_DIR, config.DOC_DIRS)

    app = flask.Flask(__name__)
    app.secret_key = config.SECRET_KEY

    app.add_url_rule("/", view_func=main.Main.as_view("main"))
    app.add_url_rule("/search",
                     view_func=search.SearchResult.as_view("search", indexer))
    app.add_url_rule("/settings",
                     view_func=settings.SearchSettings.as_view("settings"))

    for directory in config.DOC_DIRS:
        app.add_url_rule("/{0}/<path:path>".format(directory),
                         view_func=static.DocumentView.as_view(
                             directory, directory))

    app.run("localhost", 8080, False)
Example #18
    def setUp(self):
        """create an object of Indexer class
        """
        self.i = indexer.Indexer('dbase')
Example #19
        sys.exit(1)  #exit interpreter

    print 'Desired precision@10 for context: {}'.format(arglist[1])
    print 'Desired precision@10 for trends: {}'.format(arglist[2])

    precisionTenTargBing = float(arglist[1])  #must convert string to float
    precisionTenTargTwitter = float(arglist[2])  #must convert string to float
    # Connect to the clients with the stored keys and build the query from the remaining args.

    bingClient = bingclient.BingClient(constants.BING_ACCT_KEY)
    twitterClient = twitterclient.TwitterClient(constants.APP_KEY,
                                                constants.APP_SECRET,
                                                constants.OAUTH_TOKEN,
                                                constants.OAUTH_TOKEN_SECRET)
    indexer = indexer.Indexer()
    expandedQueryBing = ' '.join(arglist[3:])
    queryOptimizer = rocchio.RocchioOptimizeQuery(expandedQueryBing)

    firstPass = 1
    precisionAtK = 0.00
    queryWeights = {}

    #while precision at 10 is less than desired amt issue a query, obtain new precision metric, expand query, repeat
    while (precisionAtK < precisionTenTargBing):
        precisionAtK = 0.00  #reset precision each round
        #PROCESS A QUERY

        print 'Parameters'
        print '%-20s= %s' % ("Query", expandedQueryBing)
        print '%-20s= %s' % ("Target Precision", precisionTenTargBing)
Example #20
import flask
from flask import request, jsonify
from flask_cors import CORS
import controller
import lookup
import indexer
import apptrace
import settings

app = flask.Flask(__name__)
cors = CORS(app, resources={r"/v1/*": {"origins": "*"}})

setting = settings.Setting()
debug = setting.debugMode
trx = apptrace.AppTrace(setting.debugMode)
indx = indexer.Indexer(trx, setting)
lp = lookup.Lookup(trx, setting)
ctrl = controller.Controller(lp, indx, trx, setting)


@app.errorhandler(404)
def page_not_found(e):
    return "<h1>404</h1><p>The resource could not be found.</p>", 404


@app.route('/v1/search', methods=['GET'])
def search():
    query_parameters = request.args
    enableTrace = False
    if 'enabletrace' in query_parameters:
        enableTrace = True
Example #21
    def setUp(self):
        self.testindexer = indexer.Indexer('database')
Example #22
    head = "".join(doc[1])
    text = "".join(doc[2])
    headline_toks = ts.stem_tokens(ts.removestops(ts.tokenizeText(head)))
    text_toks = ts.stem_tokens(ts.removestops(ts.tokenizeText(text)))
    tdoc = [doc_id, headline_toks, text_toks]
    term_docs.append(tdoc)
print("tokenization, stopping, stemming DONE.")
timestep1 = timer()
print("after " + str(timestep1 - starttime) + " seconds.")
#create index for docs -> indexer.py
index = indexer.Indexer(term_docs)

#SAVE INDEX VARIABLE for the query search
with open("indexvar.txt", 'wb') as f:
    pickle.dump(index, f, protocol=-1)

#SAVE PRINT VERSION OF INDEX for viewing
indexer.PrintIndex2Text(index, "index.txt")

print("Positional inverted index created, find print version: 'index.txt'")
timestep2 = timer()
print("after " + str(timestep2 - timestep1) + " seconds.")
##########################################
"""
Extra variable: A simple document index, so when document IDs
matching a query are found, we can also return the actual 
Example #23
    def __init__(self, index):
        self.fid = indexer.Indexer(index)
Example #24
import os
import indexer

if __name__ == '__main__':

    indexer = indexer.Indexer(index_path='./inverted_index/')

    path = '/home/tani/wikidump/'

    for filename in os.listdir(path):
        if filename.startswith('wikidump') and \
           not filename.endswith('bz2'):
            print('parsing {}...'.format(filename))
            indexer.parse_data(os.path.join(path, filename))

    indexer.finish_indexing()
Example #25
def main():
  index_dict = {}

  try:
    pkl_file = open(indexer.Indexer.filename, 'rb')
    index_dict = pickle.load(pkl_file)
    pkl_file.close()
  except IOError:
    print "Pickle file not found."

  indx = indexer.Indexer(index_dict)
  db_manager = dbmanager.dbmanager(DB_NAME)
  logging.basicConfig(filename=LOG_NAME,
                      format='%(asctime)s:%(levelname)s:%(message)s',
                      filemode='w', level=logging.WARN)
  frontier = ['http://www.theonion.com', 'http://www.reddit.com', 'https://en.wikipedia.org/wiki/Satire']
  visited = {}
  domains = {}
  db_visited = db_manager.get_visited()
  db_frontier = db_manager.get_frontier()

  frontier += db_frontier

  for url in db_visited:
    print "Already visited: " + url
    visited[url] = 1

  current_threads = 0
  threads = []
  data = []
  t_urls = []

  for url in frontier:
    if visited.get(url, None):
      logging.info("Not requesting " + url + " because it has already been visited.")
      continue

    if domains.get(get_domain(url), 0) >= MAX_REQ_PER_DOMAIN:
      logging.info("Not requesting " + url + " because max requests per domain has been exceeded.")
      continue

    if is_blacklisted(url):
      logging.info("Not requesting " + url + " because it is blacklisted.")
      continue

    if current_threads < MAX_THREADS:
      logging.info("Requesting " + url)
      print "Requesting " + url + " as t=" + str(current_threads)
      visited[url] = 1

      urldom = get_domain(url)
      if urldom in domains:
        domains[urldom] += 1
      else:
        domains[urldom] = 1

      d = []
      data.append(d)
      t_urls.append(url)
      t = Requester(url, TIME_LIMIT, d, MAX_SIZE_BYTES)
      t.start()
      threads.append(t)
      current_threads += 1

    # Once a batch of threads is full (or the frontier is exhausted),
    # join them, record the results, and checkpoint the index.
    if (current_threads >= MAX_THREADS) or (url == frontier[-1]):
      current_threads = 0
      for t in threads:
        t.join()

      for i in range(len(t_urls)):
        htmldata = ""
        if data[i]:
          htmldata = data[i][0]
        db_manager.insert_visited(t_urls[i], len(htmldata))

        page_urls = list(set(get_urls(t_urls[i], htmldata)))
        indx.index_page(t_urls[i], htmldata)
        db_manager.insert_frontier(page_urls, t_urls[i])
        frontier += page_urls

      output_pkl = open(indexer.Indexer.filename, 'wb')
      pickle.dump(indx.index, output_pkl)
      output_pkl.close()

      threads = []
      data = []
      t_urls = []

  db_manager.close()
Example #26
import time
import psutil
import indexer as Indexer
import document
import sys
import gc
import os
import merge
import random
import calculations

# Document id
docid = 0
indexer = Indexer.Indexer()  #Indexer of tokenizer

# Begin the timer.
start = time.time()


def memory_usage_psutil():
    # return the memory usage in percentage like top
    process = psutil.Process(os.getpid())
    mem = process.memory_percent()
    return mem


# Load all files from the input folder
def getFiles(path):
    files = os.listdir(path)
Example #27

    # Read the JSON files and merge everything into one output.txt

    path = input("Enter Path Name: ")

    files = readFiles(path)
    doc_id = DocID()
    manager = IndexerManager(doc_id, files)
    get_doc_lock = threading.Lock()  #locks for multithreading
    simhash_lock = threading.Lock()
    # Create and instantiate one indexer thread per the THREADS constant.
    indexers = [
        indexer.Indexer(
            "partial(thread" + str(i) + ").txt",
            manager,
            get_doc_lock,
            simhash_lock,
            i) for i in range(1, THREADS + 1)
    ]
    for idx_thread in indexers:
        idx_thread.start()  # start all indexer threads
    for idx_thread in indexers:
        idx_thread.join()  # wait for all indexer threads
    # Merge the partial indexes the indexers registered with the manager.
    mergeFiles(manager.partial_indexes)
    # Store the docID dictionary for later use.
    doc_id.write_doc_id("docID.json")
    # Create an index of the index for optimized search times.
    indexIndex("output.txt", "indexindex.json")