def inserta(self, event):  # wxGlade: mainFrame.<event_handler>
    ok = True
    doc = self.txName.GetValue()
    path = self.txPath.GetValue()
    if len(doc) == 0:
        # "You must enter the document name"
        wx.MessageBox('Debes introducir el nombre del documento', 'Info',
                      wx.OK | wx.ICON_INFORMATION)
        ok = False
    elif len(path) == 0:
        # Default to a Docs folder under the working directory
        path = os.getcwd() + '/Docs'
    if ok:
        index = indexer()
        index.registerNewDocument(doc, path)
        # "Document registered successfully"
        wx.MessageBox('Documento registrado con éxito', 'Info',
                      wx.OK | wx.ICON_INFORMATION)
        index = indexer()
        docs = index.getDocuments()
        self.comboDocumentos.AppendItems(docs)
    event.Skip()
def main(args):
    crawler.crawler(['http://stankin.ru/'], os.path.join(os.getcwd(), 'data'), depth=5)
    indexer.indexer(os.path.join(os.getcwd(), 'data', 'downloaded.json'),
                    os.path.join(os.getcwd(), 'data2'))
    return 0
def __init__(self, parent, id, title):
    wx.Frame.__init__(self, parent, id, title, size=(450, 200), style=wx.NO_BORDER)
    # load indexer
    self.dex = indexer.indexer()
    # hotkey setup
    self.registerHotKeys()
    # bind window change
    self.Bind(wx.EVT_ACTIVATE, self.handleLostFocus, id=200)
    # window contents
    self.titleText = wx.StaticText(self, 0, 'Atom Launcher', style=wx.ALIGN_CENTRE)
    self.titleFont = wx.Font(24, wx.DEFAULT, wx.NORMAL, wx.NORMAL)
    self.titleText.SetFont(self.titleFont)
    self.commandBox = wx.TextCtrl(self, 1, '', size=(450, 20),
                                  style=wx.TE_PROCESS_ENTER | wx.TE_PROCESS_TAB)
    self.suggestionBox = suggestionBox(self, pos=(0, 60), size=(450, 140),
                                       style=wx.html.HW_SCROLLBAR_NEVER)
    # bind text events
    self.commandBox.Bind(wx.EVT_KEY_DOWN, self.onKeyDown)
    self.commandBox.Bind(wx.EVT_KEY_UP, self.onKeyUp)
    # sizers
    self.vbox = wx.BoxSizer(wx.VERTICAL)
    self.vbox.Add(self.titleText, flag=wx.ALIGN_CENTER)
    self.vbox.Add(self.commandBox, flag=wx.ALIGN_CENTER)
    self.SetSizer(self.vbox)
    self.Layout()
    self.Center()
def start_from_terminal(app):
    parser = optparse.OptionParser()
    parser.add_option("-p", "--port", help="which port to serve content on",
                      type="int", default=port)
    opts, args = parser.parse_args()
    net_args = {
        "model_def_file": model_def_file,
        "pretrained_model_file": pretrained_model_file,
        "gpu_mode": gpu_mode,
        "device_id": 1,
        "image_dim": image_dim,
        "raw_scale": raw_scale,
    }
    # Initialize classifier
    app.agent = agent(**net_args)
    logging.info("Initialize vision model done")
    # warm start by forward pass for allocation
    app.agent.net.forward()
    logging.info("Net forward done")
    # Initialize the indexer, loading one database per category
    app.indexer = indexer()
    for category_id in CATEGORY_NAME:
        app.indexer.load(category_id, DATABASE_FILENAME % category_id)
        logging.info("Loading indexer for {}".format(category_id))
    logging.info("Initialize indexer done")
    # app.indexer.load(DATABASE_FILE)
    start_tornado(app, opts.port)
def crawler():
    path = './world wide web/'
    world_wide_web = indexer('./world wide web')
    matriz_de_transicion = [[0 for j in range(len(world_wide_web))]
                            for page in world_wide_web.keys()]
    for key, value in world_wide_web.items():
        pagina = open(path + "www." + value + '.com.html', 'r')
        pag = open(path + "www." + value + '.com.html', 'r')
        # First pass: count the outgoing links on the page.
        links = 0
        for linea in pagina:
            if re.findall('www.([a-zA-Z]+).com', linea):
                links += 1
        # Second pass: give each linked page a transition probability of 1/links.
        for linea in pag:
            if re.findall('www.([a-zA-Z]+).com', linea):
                for encontrado in re.finditer('www.([a-zA-Z]+).com', linea):
                    sub_cadena = list(encontrado.span())
                    direccion = (linea[sub_cadena[0]:sub_cadena[1]])[4:-4]
                    lookup = {value: key for key, value in world_wide_web.items()}
                    matriz_de_transicion[lookup[direccion]][key] = 1 / links
    # for row in matriz_de_transicion:
    #     print("\n")
    #     for col in row:
    #         print("| {:.2f} |\t ".format(col), end="")
    # print("")
    return matriz_de_transicion, world_wide_web
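# The matrix built above is column-stochastic (entry [dest][src] holds 1/links
# of the source page), so it can drive a PageRank-style power iteration. A
# minimal usage sketch, assuming NumPy; the `pagerank` helper, damping factor,
# and iteration count are illustrative choices, not part of the original code.
import numpy as np

def pagerank(matriz_de_transicion, damping=0.85, iterations=50):
    M = np.array(matriz_de_transicion, dtype=float)
    n = M.shape[0]
    rank = np.full(n, 1.0 / n)  # start from the uniform distribution
    for _ in range(iterations):
        # Damped update: teleport with prob. 1-d, follow links with prob. d
        rank = (1 - damping) / n + damping * (M @ rank)
    return rank

# matriz, paginas = crawler()
# print(pagerank(matriz))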
def __init__(self):
    self.index = indexer.indexer()
    self.database = db_communication.db_com()
    self.base_dir = ""
    self.store_me = dict()
    self.documentCount = 0  # count of the documents crawled so far
    self.chunk = chunksys.chunksys()
    self.chunkstore = list()
def main(options):
    # collecting vocab
    logging.info("start collecting vocabulary")
    training_corpus = open(options.training_file)
    indexed_ngrams = []
    predictions = []
    vocab = indexer()
    vocab.add("</s>")  # end = 0
    vocab.add("<s>")   # start = 1
    unigram_count = Counter()
    sent_count = 0
    for sentence in training_corpus:
        tokens = ["<s>"] * (options.n_gram - 1)
        tokens.extend(sentence.strip().split(' '))
        indexed_sentence = []
        for token in tokens:
            ix = vocab.getIndex(token)
            indexed_sentence.append(ix)
            if token != "<s>":
                count = unigram_count.get(ix, 0)
                unigram_count[ix] = count + 1
        # think of a sentence with length 1 while we are extracting bigrams:
        # after the sentence is augmented with an extra "<s>" at the beginning
        # (now length 2), we want to extract 1 bigram: [<s>, w0]
        # (note that we don't want [w0, </s>]) -- that's why we add 1 here.
        for start in range(len(indexed_sentence) - options.n_gram + 1):
            indexed_ngrams.append(indexed_sentence[start:start + options.n_gram])
            if start + options.n_gram < len(indexed_sentence):
                predictions.append(indexed_sentence[start + options.n_gram])
            else:
                eix = vocab.indexOf("</s>")
                predictions.append(eix)
                count = unigram_count.get(eix, 0)
                unigram_count[eix] = count + 1
        sent_count += 1
    unigram_count[vocab.getIndex("<s>")] = sent_count
    training_corpus.close()
    total_unigram_count = floatX(sum(unigram_count.values()))
    unigram_dist = [floatX(0.0)] * len(unigram_count)
    for key in unigram_count.keys():
        unigram_dist[key] = floatX(unigram_count[key] / total_unigram_count)
    unigram_count.clear()  # save some memory...
    unigram_dist = np.array(unigram_dist, dtype=floatX)
    logging.info("vocabulary collection finished")

    # training
    logging.info("start training with n-gram size {0}, vocab size {1}, learning rate {2}, "
                 .format(options.n_gram, vocab.size(), options.learning_rate) +
                 "word dimension {0}, hidden dimension 1 {1}, hidden dimension 2 {2}, "
                 "noise sample size {3}"
                 .format(options.word_dim, options.hidden_dim1, options.hidden_dim2,
                         options.noise_sample_size))
    net = nplm(options.n_gram, vocab.size(), options.word_dim, options.hidden_dim1,
               options.hidden_dim2, options.noise_sample_size, options.batch_size,
               unigram_dist)
    for epoch in range(options.max_epoch):
        sgd(indexed_ngrams, predictions, net, options, epoch, unigram_dist)
    logging.info("training finished")
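# To make the windowing comment above concrete: for bigrams (n_gram = 2) and a
# one-word sentence, exactly one window is extracted and its prediction is the
# sentence-end marker. A self-contained check using plain strings in place of
# vocabulary indices (toy values for illustration, not from the original code):
n_gram = 2
tokens = ["<s>"] * (n_gram - 1) + "w0".split(' ')   # -> ["<s>", "w0"]
ngrams, predictions = [], []
for start in range(len(tokens) - n_gram + 1):
    ngrams.append(tokens[start:start + n_gram])
    predictions.append(tokens[start + n_gram] if start + n_gram < len(tokens) else "</s>")
assert ngrams == [["<s>", "w0"]] and predictions == ["</s>"]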
def __init__(self, *args, **kwds):
    # begin wxGlade: mainFrame.__init__
    kwds["style"] = wx.DEFAULT_FRAME_STYLE
    wx.Frame.__init__(self, *args, **kwds)
    self.relleno1 = wx.Panel(self, -1)
    self.relleno2 = wx.Panel(self, -1)
    self.relleno3 = wx.Panel(self, -1)
    self.relleno4 = wx.Panel(self, -1)
    self.rellenoPregIzq = wx.Panel(self, -1)
    self.txPregunta = wx.TextCtrl(self, -1, "")
    self.btBuscar = wx.Button(self, -1, "Buscar")
    self.rellenoBotonDer = wx.Panel(self, -1)
    self.rellenoBajo1 = wx.Panel(self, -1)
    self.rellenoBajo2 = wx.Panel(self, -1)
    self.btXML = wx.Button(self, -1, "Ver XML")
    self.rellenoBajo4 = wx.Panel(self, -1)
    self.rellenoIzqVer = wx.Panel(self, -1)
    self.lbVisualiza = wx.StaticText(self, -1, "Selecciona un documento para visualizarlo.")
    self.rellenoVerDer1 = wx.Panel(self, -1)
    self.rellenoVerDer2 = wx.Panel(self, -1)
    self.rellenoIzqTxVer = wx.Panel(self, -1)
    self.comboDocumentos = wx.ComboBox(self, -1, choices=[],
                                       style=wx.CB_DROPDOWN | wx.CB_READONLY)
    self.btVer = wx.Button(self, -1, "Ver")
    self.rellenoDerTxVer = wx.Panel(self, -1)
    self.rellenoBajoVer1 = wx.Panel(self, -1)
    self.lbNom = wx.StaticText(self, -1, " Nombre.")
    self.lbPath = wx.StaticText(self, -1, " Path del documento.")
    self.btSimilares = wx.Button(self, -1, "Similares")
    self.rellenoBajoVer4 = wx.Panel(self, -1)
    self.rellenoIzqInsert = wx.Panel(self, -1)
    self.txName = wx.TextCtrl(self, -1, "")
    self.txPath = wx.TextCtrl(self, -1, "")
    self.btInsetar = wx.Button(self, -1, "Insertar")
    self.rellenoDerInsertar = wx.Panel(self, -1)

    self.__set_properties()
    self.__do_layout()

    self.Bind(wx.EVT_BUTTON, self.busca, self.btBuscar)
    self.Bind(wx.EVT_BUTTON, self.muestraXML, self.btXML)
    self.Bind(wx.EVT_TEXT, self.getIden, self.comboDocumentos)
    self.Bind(wx.EVT_BUTTON, self.visualiza, self.btVer)
    self.Bind(wx.EVT_BUTTON, self.similares, self.btSimilares)
    self.Bind(wx.EVT_BUTTON, self.inserta, self.btInsetar)
    # end wxGlade

    # Populate the document combo box from the index
    index = indexer()
    docs = index.getDocuments()
    self.comboDocumentos.AppendItems(docs)
    self.comboDocumentos.SetValue(docs[0])
    self.idDoc = 1
def __init__(self, port, net_args, oversample, category_no, max_num_items,
             database_filename):
    self.net_args = net_args
    self.database_filename = database_filename
    # Initialize classifier
    app.oversample = oversample
    app.agent = agent(**self.net_args)
    logging.info('Initialize vision model done')
    app.agent.net.forward()
    logging.info('Net forward done')
    # Initialize indexer
    app.indexer = indexer(category_no, max_num_items)
    app.indexer.load_category(database_filename)
    logging.info('Initialize indexer done')
    # get parser_utils
    app.parser_utils = parser_utils()
    app.korean_url_handler = korean_url_handler()
    # start web server
    web_server.__init__(self, app, port)
def __buildQuestion(self, conn, question, d, emptyVector):
    empty = ['']
    questionVector = copy(emptyVector)
    index = indexer()
    stopList = index.getStopList()
    weightList = buscador.__parseQuestion(self, question)
    wordSet = set(weightList.keys())
    wordSet = wordSet - stopList
    wordSet = wordSet - set(empty)
    dicFdj = buscador.__getFdjs(self, conn)
    for word in wordSet:
        questionVector[word] = weightList[word]
    return wordSet, questionVector, dicFdj
def search():
    search_bool = input("Would you like to compile the search indexer? ('y'/'n')")
    if search_bool == "y":
        rootDir = 'WEBPAGES/WEBPAGES_RAW'
        the_indexer = indexer.indexer(rootDir)
        the_indexer.create_index()
    infile = open("dictionary_file.txt", "r")
    the_indexer = json.load(infile)
    infile.close()  # close right away; the original closed after the infinite loop, which never runs
    print(len(the_indexer))
    special_char_table = str.maketrans(string.punctuation,
                                       " " * len(string.punctuation))
    while True:
        search_input = input("Search: ")
        tokens = word_tokenize(search_input)
        LS = LancasterStemmer()
        set_of_posts = []
        if len(tokens) > 1:
            # Multi-word query: stem each token and intersect the posting lists
            for token in tokens:
                for split_token in token.translate(special_char_table).split():
                    stem_input = LS.stem(split_token)
                    set_of_posts.append(the_indexer[stem_input])
            inter_set = get_intersect(set_of_posts)
            if inter_set:
                for posts in inter_set:
                    print(posts[-1])
            else:
                print("No results")
        else:
            stem_input = LS.stem(tokens[0])
            if the_indexer[stem_input]:
                for posts in the_indexer[stem_input]:
                    print(posts[-1])
            else:
                print("No results")
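# `get_intersect` is called above but not shown in the snippet. A plausible
# sketch, assuming each entry of `set_of_posts` is a list of postings and that
# equal postings compare equal; the tuple conversion only makes list-shaped
# postings hashable. An illustration, not the original helper.
def get_intersect(set_of_posts):
    if not set_of_posts:
        return []
    common = set(map(tuple, set_of_posts[0]))
    for postings in set_of_posts[1:]:
        common &= set(map(tuple, postings))  # keep postings present in every list
    return [list(p) for p in common]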
import gevent  # needed for gevent.spawn/joinall below
from gevent import Timeout
from gevent import monkey
monkey.patch_all()

from lynx import lynx
import spliter
from indexer import indexer
from sorter import sorter
from ander import ander

if __name__ == '__main__':
    base_urls = ['http://www.google.com', 'http://www.wikipedia.org']
    print '*** 0. mkdir dir to store the html files ***'
    path_init(base_urls)
    print '*** 1. tinysearch spider starts ***'
    jobs = [gevent.spawn(fetch, base_url) for base_url in base_urls]
    gevent.joinall(jobs)
    print '*** 2. lynx html in order to get the content ***'
    lynx('html')
    print '*** 3. load spliter dict ***'
    dic = spliter.init()
    print '*** 4. index content and return doc_id ***'
    print "*** 5. doc_id is %s ***" % (indexer(dic, './test/', 'test.html'))
    word_list = [u'百科']
    print '*** 6. get common doc_id list ***'
    doc_id_list = ander(word_list)
    print doc_id_list
    print '*** 7. sort the result ***'
    print sorter(word_list, doc_id_list)
import searcher
import data_load
import indexer
import quotes

indexer.indexer()
searcher.searcher()
data_load.traverser()
d = indexer.indexer("raw_data.pickle")
searcher.search(d)
def batch_process():
    catalog_data = product_information_management()
    index, bow = indexer(catalog_data)
    return index, bow
__author__ = 'rogersjeffrey'
"""
This program constructs the index by reading from the test corpus files.
It accepts the path to the index file as an argument.
"""
from sys import argv
import indexutils
import indexer

print "Enter the path of the training corpus"
file_path = raw_input("Corpus Files Path:")
index_start_time = indexutils.gettime()
print "Starting Indexing......."
indexer = indexer.indexer(file_path)
indexer.populate_index_hash()
indexer.dump_index_hash()
print "Indexing Ended"
index_end_time = indexutils.gettime()
print "Time taken to build the index: %f seconds" % (index_end_time - index_start_time)
print "Index Stats:"
indexer.get_index_stats()
def test_bar(self):
    s = 'barfoobazbitbyte'
    words = ["cat", "dog"]
    self.assertEqual(set(), indexer(string=s, words=words))
import sys, re
import indexer

db = indexer.indexer()
DEBUG = "-debug" in sys.argv

# apps = db.load()
apps = ["1", "apple", "apple-II", "app", "Apple-III", "app4", "app5",
        "app6", "app7", "app8", "app9", "app10"]

# searches the existing list of items
def search(s):
    results = []
    if DEBUG:
        print ('INFO: searching for "' + s + '" in known apps.')
    resultsbox.delete(0, END)
    # crappy linear search
    for i in apps:
        # regex match
        result = re.search(r"(.*)" + s.lower() + "(.*?)", i.lower())
        # keep a result only while under the hardcoded limit of 10
        if result and len(results) < 10:
            results.append(i)
            resultsbox.insert(END, i)

# gui stuff
from Tkinter import *

master = Tk()
master.title("Legwork Launcher")
# (assignment head reconstructed from the OUTPUT_FILENAME reference below)
OUTPUT_FILENAME = \
    '/home/taey16/storage/product/11st_julia/demo_{}.txt.wrap_size0.oversampleFalse.pickle'.format(
        category_no[current_category])

if __name__ == '__main__':
    print 'Start to indexing for {}'.format(INPUT_FILENAME)
    print 'output will be saved in {}'.format(OUTPUT_FILENAME)
    #import pdb; pdb.set_trace()
    meta_filename = '{}/{}'.format(DATASET_ROOT, INPUT_FILENAME)
    parser = parser_utils()
    input = parser.parse(meta_filename)
    agent = agent(**net_args)
    agent.net.forward()
    indexer = indexer(category_no, max_num_items)

    item_counter = 0
    for item in input:
        try:
            prd_no = item['__prd_no__']
            fname = \
                '/userdata2/index_11st_20151020/october_11st_imgdata/{}.jpg'.format(prd_no)
            object_roi = item['__object_roi__'].strip().split(',')
            category_id = item['__mctgr_no__']
            roi = parser.get_roi_meta_dic(object_roi)
            start_loading = time.time()
            image = agent.load_image_roi(fname, roi, 0)
            elapsed_loading = time.time() - start_loading
            #roi_pil, image_pil = agent.draw_roi(fname, roi, 0)
            #roi_pil.save('roi.png')
import searcher
import data_load
import indexer
import WEBcrawler
import weather
import visit_url  # used below but missing from the original imports

weather.weather()
visit_url.visit_url()
indexer.indexer()
searcher.searcher()
data_load.traverser()
d = indexer.indexer("raw_data.pickle", "shelve")
searcher.search(d)
import indexer
import tokenizer
#import timeweighted_postdata as postdata
import postdata

# Can use by importing this file and using query.getScore("Some title of a post")
idx = indexer.indexer()
ct = 0
for i, p in postdata.posts.items():
    if "title" in p:
        # some posts don't have titles (means they were deleted/some other error happened to them)
        title = p["title"].encode('ascii', 'ignore')
        selftext = p["selftext"].encode('ascii', 'ignore')
        tokens = tokenizer.tokenize(title)
        idx.addDocument(p["id"], tokens)
        ct += 1
print("Indexed {0} documents".format(ct))
idx.makeDocumentVectors()

def getScore(newtitle):
    query = tokenizer.tokenize(newtitle)
    res = idx.queryVector(query, 1)
    #print("{0} results.".format(len(res)))
    # Take average of (upvotes-downvotes) weighted by similarity score ^ 2,
    # but only for posts with simscore > max(simscore)/2
    totalweight = 0.0
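# The snippet cuts off inside getScore, but its comment spells out the intended
# computation. A sketch of the remainder under that description, assuming `res`
# is a list of (post_id, simscore) pairs and that vote counts live under
# hypothetical "ups"/"downs" keys -- both assumptions, since the real shapes
# are not shown in the snippet.
def get_score_sketch(res, posts):
    if not res:
        return 0.0
    cutoff = max(score for _, score in res) / 2.0
    totalweight = 0.0
    weighted_votes = 0.0
    for post_id, score in res:
        if score <= cutoff:
            continue  # only posts with simscore > max(simscore)/2
        weight = score ** 2
        p = posts[post_id]
        weighted_votes += weight * (p["ups"] - p["downs"])
        totalweight += weight
    return weighted_votes / totalweight if totalweight else 0.0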
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import glob
import os.path
import time
from indexer import indexer
from __init__ import DIR

INDEX = 'reach'
BULK = True

if __name__ == '__main__':
    start = time.time()
    print "Indexing species..."
    indexer(INDEX, "events", 'species.json', BULK)
    end = time.time()
    print 'Elapsed time: %f' % (end - start)
for t in temp:
    if t == "and":
        op = "and"
    elif t != "or":
        temp1.add(t)
print("Performing '" + op.upper() + "' search for: " + str(temp1))
out = list(temp1)
try:
    page = urllib.request.urlopen(
        "http://api.openweathermap.org/data/2.5/weather?q=" + "06516")
    code = page.getcode()
    if code == 200:
        content = page.read()
        content_string = content.decode("utf-8")
        json_data = json.loads(content_string)
        name = json_data["name"]
        weather = json_data["weather"][0]["main"]
        sun_rise = json_data["sys"]["sunrise"]
        sun_set = json_data["sys"]["sunset"]
except URLError as e:
    print("error")
dictionary_data = indexer.indexer()
print()
print("location : " + str(name) + " Weather : " + str(weather) +
      " Sun Rise : " + str(sun_rise) + " Sun Set : " + str(sun_set))
print()
searcher.search(dictionary_data, out, op)
indexer._aspect2fieldtype = dict(zaglavie=indexer.ngram,  # text,
                                 avtor=indexer.ngram)     # text,

import sys
import optz

# Option help strings below are Bulgarian: "tyrsi" = "search in field=value;
# fields: ...", "novo" = "creates the index", "dump" = "all data",
# "index" = "choose another index".
optz.str("tyrsi", help="търси в поле=стойност; полета: " + str(ixx.schema._fields.keys()))
# schindexer._aspect2fieldtype.keys()))
optz.bool("novo", help="Създава индекса")
optz.bool("dump", help="всички данни")
optz.str("index", help="избор на друг индекс")

if INFO:
    info.main(opts2, sys.argv[1:] or ["."])

ix = indexer()
parcheta = []
for x in info.vse.values():
    if x.etiketi.koren:
        continue
    parcheta.append(DictAttr(
        fname=x.fname,
        zaglavie=x.ime,
        avtor=", ".join(razdeli_kamila2(a) for a in x.etiketi.avtor),  # html4index
        dir=True,
    ))
    # if len(parcheta) > 5:
    #     break
if parcheta:
def distance(origin, destination):
    x = indexer.indexer(origin)
    y = indexer.indexer(destination)
    return dist.dist[x][y]
def test_dog(self):
    s = 'dogcatcatcodecatdog'
    words = ["cat", "dog"]
    self.assertEqual({0, 13}, indexer(string=s, words=words))
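# This test and the earlier test_bar pin down the contract of this `indexer`:
# it returns the set of start offsets where some permutation of all the query
# words, concatenated, occurs in the string ('dogcat' at 0, 'catdog' at 13).
# A minimal sketch that satisfies both tests, assuming equal-length words; the
# implementation under test is not shown, so this is an illustration only.
from collections import Counter

def indexer(string, words):
    word_len = len(words[0])            # assumes all words share one length
    total_len = word_len * len(words)
    target = Counter(words)
    found = set()
    for start in range(len(string) - total_len + 1):
        # Cut the window into word-sized chunks and compare multisets.
        seen = Counter(string[i:i + word_len]
                       for i in range(start, start + total_len, word_len))
        if seen == target:
            found.add(start)
    return found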