def create_fields(opt):
    spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
    if opt.src_lang not in spacy_langs:
        print('invalid src language: ' + opt.src_lang + ', supported languages: ' + str(spacy_langs))
    if opt.trg_lang not in spacy_langs:
        print('invalid trg language: ' + opt.trg_lang + ', supported languages: ' + str(spacy_langs))

    print("loading spacy tokenizers...")
    t_src = tokenize(opt.src_lang)
    t_trg = tokenize(opt.trg_lang)

    TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

    if opt.load_weights is not None:
        try:
            print("loading presaved fields...")
            SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
            TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
        except (OSError, pickle.UnpicklingError):
            print("error opening SRC.pkl and TRG.pkl field files, please ensure they are in " + opt.load_weights + "/")
            quit()

    return SRC, TRG
def teste_arquivo(self):
    arquivo = open("input.txt")
    input = arquivo.readline()
    resposta = ["Hoje", "tem", "dojo-nilc", "."]
    self.assertEqual(tokenize(input), resposta)
    print(tokenize(input))
    arquivo.close()
def create_fields(opt):
    timer = Timer()
    spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
    if opt.src_lang not in spacy_langs:
        print('invalid src language: ' + opt.src_lang + ', supported languages: ' + str(spacy_langs))
    if opt.trg_lang not in spacy_langs:
        print('invalid trg language: ' + opt.trg_lang + ', supported languages: ' + str(spacy_langs))

    # Map language codes to spaCy model names ('it' and 'nl' entries added so
    # every code listed in spacy_langs resolves here instead of raising KeyError).
    lang_compatibility = {
        'en': 'en_core_web_md',
        'pt': 'pt_core_news_md',
        'fr': 'fr_core_news_md',
        'de': 'de_core_news_md',
        'es': 'es_core_news_md',
        'it': 'it_core_news_md',
        'nl': 'nl_core_news_md'
    }
    src_lang = lang_compatibility[opt.src_lang]
    trg_lang = lang_compatibility[opt.trg_lang]

    print("loading spacy tokenizers...")
    try:
        t_src = tokenize(src_lang)
        t_trg = tokenize(trg_lang)
    except Exception as e:
        print(f'Reached exception {e}.\n'
              f'Please download the model using python -m spacy download.')
        quit()

    TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

    if opt.load_weights is not None:
        try:
            print("loading presaved fields...")
            SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
            TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
        except Exception:
            traceback.print_exc(file=sys.stdout)
            print("error opening SRC.pkl and TRG.pkl field files, please ensure they are in " + opt.load_weights + "/")
            quit()

    timer.print_time('create_fields')
    return (SRC, TRG)
def read_data(self):
    self.english_data = open('data/english.txt').read()
    self.french_data = open('data/french.txt').read()

    # Create the tokenizer objects
    en_tokenizer = tokenize('en')
    fr_tokenizer = tokenize('fr')

    # Field is used for pre-processing and post-processing
    self.SRC = data.Field(lower=True, tokenize=en_tokenizer.tokenizer)
    self.TRG = data.Field(lower=True, tokenize=fr_tokenizer.tokenizer, init_token='<sos>', eos_token='<eos>')
def scraper(url, resp):
    # Skip error responses and responses with no content.
    if resp.status >= 400 and resp.status < 600:
        return list()
    if resp.status >= 600 and resp.status <= 608:
        return list()
    if resp.status == 200 and resp.raw_response is None:
        return list()

    soup = BeautifulSoup(resp.raw_response.content, features="lxml")
    links = extract_next_links(url, soup)

    # Collect visible text from paragraphs, the title, and heading tags.
    contentText = ""
    for contentGroup in soup.find_all(['p', 'title', re.compile(r"^h[0-9]+$")]):
        for string in contentGroup.stripped_strings:
            contentText += string + ' '

    result = []
    with open("frontier.shelve.urls.txt", "a+", encoding="utf-8") as urlsFile:
        urlsFile.write(str(url) + " " + str(tokenize(contentText)) + '\n')
        for link in links:
            if is_valid(link):
                result.append(link)
            else:
                urlsFile.write(str(link) + " -1\n")
    return result
def executeHelper(self, message):
    """
    Ad hoc helper for the left outer join code that uses it, which requires
    the commit deepcopying to be turned off.
    """
    from collections import deque
    message = self.encode(message)
    tokens = tokenize(message)
    tokens = deque(tokens)

    if tokens[0] == "CREATE":
        self.createTable(tokens)
    elif tokens[0] == "INSERT":
        self.insert(tokens)
    elif tokens[0] == "UPDATE":
        return self.update(tokens)
    elif tokens[0] == "DELETE":
        return self.delete(tokens)
    elif message.find("LEFT OUTER JOIN") != -1:
        return self.leftOuterJoin(tokens)
    elif tokens[0] == "SELECT":
        return self.select(tokens)

    # Returns an empty list for statements that produce no rows; row-returning
    # statements give back a list of tuples (rows converted from lists to tuples).
    return []
def allCitiesFreq(path):
    cityFileSetIn = open('cityFileSets', 'rb')
    cityFileSet = pickle.load(cityFileSetIn)
    cityFileSetIn.close()

    # Count hotels, skipping small cities and entries with no city label.
    TotalHotels = 0
    for city in cityFileSet:
        files = cityFileSet[city]
        if len(files) < 50 or city == 'noCity':
            continue
        TotalHotels += len(cityFileSet[city])

    if not os.path.isfile('./FreqResults/allCitiesFreq'):
        freqDict = dict()
        TotalReviews = 0
        currentHotel = 0
        for city in cityFileSet:
            files = cityFileSet[city]
            if len(files) < 50 or city == 'noCity':
                continue
            for file in files:
                j_hotel = open(path + file)
                hotel = json.load(j_hotel)
                j_hotel.close()
                currentHotel += 1
                stopset = hotelNameAddress(hotel)
                for review in hotel.get('Reviews'):
                    TotalReviews += 1
                    content = review.get('Content').encode('ascii', 'ignore')
                    overall = float(review.get('Ratings').get('Overall'))
                    tokens = tokenize(content, stopset)
                    docLength = len(tokens)
                    reviewCounter = Counter(tokens)
                    for word in reviewCounter:
                        wordFreq = float(reviewCounter[word]) / float(docLength)
                        if word in freqDict:
                            freqDict[word]['countSum'] += reviewCounter[word]
                            freqDict[word]['freqSum'] += wordFreq
                            freqDict[word]['docCount'] += 1
                            if freqDict[word]['lastHotel'] != file:
                                freqDict[word]['lastHotel'] = file
                                freqDict[word]['hotelCount'] += 1
                            freqDict[word]['ratingSum'] += overall
                        else:
                            freqDict[word] = dict([('countSum', reviewCounter[word]), ('freqSum', wordFreq),
                                                   ('docCount', 1), ('hotelCount', 1),
                                                   ('ratingSum', overall), ('lastHotel', file)])
                currentProgress(currentHotel, TotalHotels)
        freqDict['TotalHotels'] = TotalHotels
        freqDict['TotalReviews'] = TotalReviews
        allCityFreqOut = open('./FreqResults/allCitiesFreq', 'wb')
        pickle.dump(freqDict, allCityFreqOut)
        allCityFreqOut.close()
    else:
        allCityFreqIn = open('./FreqResults/allCitiesFreq', 'rb')
        freqDict = pickle.load(allCityFreqIn)
        allCityFreqIn.close()
    return freqDict
def freq(path):
    if not os.path.isfile('./FreqResults/allFreq'):
        freqDict = dict()
        currentFile = 0
        totalFiles = len(os.listdir(path))
        TotalHotels = 0
        TotalReviews = 0
        for file in os.listdir(path):
            currentFile += 1
            currentProgress(currentFile, totalFiles)
            j_hotel = open(path + file)
            hotel = json.load(j_hotel)
            j_hotel.close()
            stopset = hotelNameAddress(hotel)
            reviews = hotel.get('Reviews')
            if len(reviews) == 0:
                continue
            else:
                TotalHotels += 1
            for review in reviews:
                TotalReviews += 1
                content = review.get('Content').encode('ascii', 'ignore')
                overall = float(review.get('Ratings').get('Overall'))
                tokens = tokenize(content, stopset)
                docLength = len(tokens)
                reviewCounter = Counter(tokens)
                for word in reviewCounter:
                    wordFreq = float(reviewCounter[word]) / float(docLength)
                    if word in freqDict:
                        freqDict[word]['countSum'] += wordFreq
                        freqDict[word]['docCount'] += 1
                        freqDict[word]['ratingSum'] += overall
                        if freqDict[word]['lastHotel'] != file:
                            freqDict[word]['lastHotel'] = file
                            freqDict[word]['hotelCount'] += 1
                    else:
                        freqDict[word] = dict([('countSum', wordFreq), ('docCount', 1), ('hotelCount', 1),
                                               ('ratingSum', overall), ('lastHotel', file)])
        freqDict['TotalHotels'] = TotalHotels
        freqDict['TotalReviews'] = TotalReviews
        allFreqOut = open('./FreqResults/allFreq', 'wb')
        pickle.dump(freqDict, allFreqOut)
        allFreqOut.close()
    else:
        allFreqIn = open('./FreqResults/allFreq', 'rb')
        freqDict = pickle.load(allFreqIn)
        allFreqIn.close()
    return freqDict
def create_fields(opt):
    print("loading spacy tokenizers...")
    t_src = tokenize(opt.src_lang)
    t_trg = tokenize(opt.trg_lang)

    TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

    if opt.load_weights is not None:
        try:
            print("loading presaved fields...")
            SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
            TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
        except (OSError, pickle.UnpicklingError):
            print("error opening SRC.pkl and TRG.pkl field files, please ensure they are in " + opt.load_weights + "/")
            quit()

    return SRC, TRG
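# --- Illustrative only: a minimal smoke-test call for create_fields above. ---
# The option object here is a hypothetical stand-in (argparse.Namespace); the
# real project presumably builds opt from its own argument parser, and the
# attribute names below are simply the ones create_fields reads.
from argparse import Namespace

opt = Namespace(src_lang='en', trg_lang='fr', load_weights=None)
SRC, TRG = create_fields(opt)
print(type(SRC), type(TRG))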
def preprocess(dataset, algorithm):
    message = ""
    message += PREPROCESS_BEGIN_MESSAGE + '\n'
    print(PREPROCESS_BEGIN_MESSAGE)
    check, stopWords, loadMessage = loadStopWords(
        "/var/www/html/winifier/server/__server_python__/__preprocess__/stop_words/stanford_core_nlp_stopWords.txt")
    message += loadMessage + '\n'
    if check:
        tokenizeCheck, tokenList, tokenizeMessage = tokenize(dataset, algorithm)
        message += tokenizeMessage + '\n'
        if tokenizeCheck:
            removalCheck, pureTokens, removeStMessage = removeStopWords(tokenList, stopWords)
            message += removeStMessage + '\n'
            if removalCheck:
                stemCheck, stemmedTokens, stemMessage = stemming(pureTokens)
                message += stemMessage + '\n'
                if stemCheck:
                    labelListCheck, labelList, pointsList, labelPtMessage = getLabelsAndPoints(dataset)
                    message += labelPtMessage + '\n'
                    if labelListCheck:
                        message += PREPROCESS_SUCCESS_MESSAGE + '\n'
                        print(PREPROCESS_SUCCESS_MESSAGE)
                        with open(getPath() + "/__preprocess__/messages.log", "a") as log:
                            log.write(message)
                        return True, stemmedTokens, pointsList, labelList
                    else:
                        message += PREPROCESS_ERROR_MESSAGE + '\n'
                        print(PREPROCESS_ERROR_MESSAGE)
                        with open(getPath() + "/__preprocess__/messages.log", "a") as log:
                            log.write(message)
                        return False, None, None, None
    else:
        message += PREPROCESS_ERROR_MESSAGE
        print(PREPROCESS_ERROR_MESSAGE)
        with open(getPath() + "/__preprocess__/messages.log", "a") as log:
            log.write(message)
        return False, None, None, None
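# --- Illustrative only: how the preprocess pipeline above might be driven. ---
# "dataset.csv" and "kmeans" are placeholder arguments, not values from the
# project; the unpacking just mirrors the (check, tokens, points, labels)
# return shape used above.
ok, stemmed_tokens, points_list, label_list = preprocess("dataset.csv", "kmeans")
if ok:
    print(len(stemmed_tokens), len(points_list), len(label_list))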
def teste_Nove(self):
    input = "A banana custa R$2,50."
    resposta = ["A", "banana", "custa", "R$", "2,50", "."]
    self.assertEqual(tokenize(input), resposta)
def execute(self, message):
    """
    :param message: sql statement string
    :return: list of tuples of rows, or an empty list for statements that
        return nothing
    """
    from collections import deque
    message = self.encode(message)
    tokens = tokenize(message)
    tokens = deque(tokens)

    # check that the proper locks are held
    if self.commitMode == 'AUTOCOMMIT':
        # checkout the latest version of the database
        self.checkout()

    returnObject = None
    if tokens[0] == "BEGIN":
        self.begin(tokens)
    elif tokens[0] == "COMMIT":
        self.commit(tokens)
    elif tokens[0] == "CREATE":
        returnObject = self.createTable(tokens)
    elif tokens[0] == "DROP":
        returnObject = self.dropTable(tokens)
    elif tokens[0] == "INSERT":
        self.lock = Connection._ALL_DATABASES[self.filename].requestLock(self.name, 'RESERVED')
        returnObject = self.insert(tokens)
    elif tokens[0] == "UPDATE":
        self.lock = Connection._ALL_DATABASES[self.filename].requestLock(self.name, 'RESERVED')
        returnObject = self.update(tokens)
    elif tokens[0] == "DELETE":
        self.lock = Connection._ALL_DATABASES[self.filename].requestLock(self.name, 'RESERVED')
        returnObject = self.delete(tokens)
    elif message.find("LEFT OUTER JOIN") != -1:
        returnObject = self.leftOuterJoin(tokens)
    elif tokens[0] == "SELECT":
        self.lock = Connection._ALL_DATABASES[self.filename].requestLock(self.name, 'SHARED')
        returnObject = self.select(tokens)
    elif tokens[0] == "ROLLBACK":
        returnObject = self.rollback(tokens)

    # if in autocommit mode, commit by updating the global database
    if self.commitMode == "AUTOCOMMIT":
        if self.lock in ['RESERVED', 'EXCLUSIVE']:
            self.lock = Connection._ALL_DATABASES[self.filename].requestLock(self.name, 'EXCLUSIVE')
        self.publishChanges()

    return returnObject if returnObject is not None else []
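# --- Illustrative only: a possible round trip through execute() above. ---
# The Connection constructor call is hypothetical (its real signature is not
# shown here), and the SQL strings are sample data, not from the original tests.
conn = Connection("example.db")
conn.execute("CREATE TABLE students (name TEXT, grade INTEGER);")
conn.execute("INSERT INTO students VALUES ('Ada', 95);")
rows = conn.execute("SELECT name, grade FROM students ORDER BY grade;")
print(rows)  # expected shape: a list of row tuples, e.g. [('Ada', 95)]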
        stopset = hotelNameAddress(hotel)
        reviews = hotel.get('Reviews')
        if len(reviews) == 0:
            continue
        for review in reviews:
            content = review.get('Content').encode('ascii', 'ignore')
            sentTokens = sent_tokenize(content)
            # strippedTokens = removeStopWords(allTokens, stopset)
            # d = Counter(strippedTokens)
            # count.update(d)
            for s in sentTokens:
                trans += [tokenize(s, stopset)]
    transOut = open('trans.dat', 'wb')
    pickle.dump(trans, transOut)
    transOut.close()
else:
    transIn = open('trans.dat', 'rb')
    trans = pickle.load(transIn)
    transIn.close()

support = ceil(0.01 * TotalReviews) / len(trans) * 25
feats = apriori(trans, zmin=setmin, zmax=setmax, supp=support, conf=confidence, target='r', report='CS')

with open('./FeatResults/allHotels_z' + str(setmax) + '_s' + str(support)[0:5] + '_c' + str(confidence) + '.csv', 'wb') as csvfile:
    foutwriter = csv.writer(csvfile, dialect='excel')
    header = ['word1', 'word2', 'word3', 'confidence', 'support',
              'W1freq', 'W2freq', 'W3freq',
              'W1tfidf', 'W2tfidf', 'W3tfidf',
              'W1tfihf', 'W2tfihf', 'W3tfihf',
              'W1idf', 'W2idf', 'W3idf',
              'W1ihf', 'W2ihf', 'W3ihf',
              'W1avgRevScore', 'W2avgRevScore', 'W3avgRevScore',
              'sentiment', 'W1Sent', 'W2Sent', 'W3Sent',
              'W1POS', 'W2POS', 'W3POS']
from pprint import pprint

from Tokenize import tokenize
from Analyze import treeBuilder
from Analyze import countTree
from Analyze import printTree
from Diff import diffTree
from Diff import simplifyTree
from Diff import simplifyVar
from Output import latexOutput

print("============================================")
print("\tPyDifferiantiator v. 0.1")
print("============================================")

input_str = input("Enter the expression >>> ")

# Get a list of tokens
token_list = tokenize(input_str)

# Debug - print all tokens with their type and value
# pprint(token_list)

# Get the tree of this expression
_builder = treeBuilder(token_list)
tree_head = _builder.getHead()

# Debug - print the original tree
# print("The original tree")
# print(printTree(tree_head))

diff_head = diffTree(tree_head)

# Not the best solution, but good for small expressions
def teste_dois(self):
    input = "hoje tem dojo"
    resposta = ["hoje", "tem", "dojo"]
    self.assertEqual(tokenize(input), resposta, "Não passou no teste 2")
def teste_dez(self):
    input = "Hoje tem dojo-nilc"
    resposta = ["Hoje", "tem", "dojo-nilc"]
    self.assertEqual(tokenize(input), resposta)
        return k, cls

    Internal.internals['compFunc'] = compFunc


if __name__ == '__main__':
    import sys, traceback, builtins
    from Preprocess import preprocess
    from Tokenize import tokenize
    from Parse import parse

    for parm in sys.argv[1:]:
        setattr(builtins, parm, True)

    fin = open(sys.argv[1], 'r')
    tree = parse(tokenize(preprocess(fin.read()), sys.argv[1]), sys.argv[1])
    fin.close()
    # print(tree)

    # def disp(x):
    #     print(len(traceback.extract_stack()), x)
    #     raise Internal.Halt()
    # This is dumb.

    newTree = []
    for xpr in tree:
        if isa(xpr, KlipList):
            if len(xpr) and xpr[0] == Sym('include'):
                fin = open(xpr[1], 'r')
                newTree += parse(tokenize(preprocess(fin.read()), xpr[1]), xpr[1])
def teste_Oito(self):
    input = "hoje tem dojo 11.2."
    resposta = ["hoje", "tem", "dojo", "11.2", "."]
    self.assertEqual(tokenize(input), resposta)
def teste_Seis(self):
    input = "hoje tem dojo 1.2"
    resposta = ["hoje", "tem", "dojo", "1.2"]
    self.assertEqual(tokenize(input), resposta)
def teste_Cinco(self):
    input = "hoje tem dojo!"
    resposta = ["hoje", "tem", "dojo", "!"]
    self.assertEqual(tokenize(input), resposta)
def teste_Quatro(self):
    input = "hoje tem dojo"
    resposta = ["hoje", "tem", "dojo"]
    self.assertEqual(tokenize(input), resposta)
def teste_Tres(self):
    input = " "
    resposta = []
    self.assertEqual(tokenize(input), resposta)
def teste_Dollar(self):
    input = "A banana custa US$2,50."
    resposta = ["A", "banana", "custa", "US$", "2,50", "."]
    self.assertEqual(tokenize(input), resposta)
            return f
        return Sym(self.cur.value)

    def __call__(self, tokens, fname):
        self.fname = fname
        self.input = tokens
        self.it = iter(tokens)
        self.inc(None)
        temp = KlipList()
        while self.cur:
            temp.append(self.parseTerminal())
            self.inc(None)
        return temp


parse = Parser()

if __name__ == '__main__':
    from Preprocess import preprocess
    from Tokenize import tokenize
    import sys, pprint

    fin = open(sys.argv[1], 'r')
    tree = parse(tokenize(preprocess(fin.read()), sys.argv[1]), sys.argv[1])
    fin.close()

    print(tree)