Example #1
def create_fields(opt):
    
    spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
    if opt.src_lang not in spacy_langs:
        print('invalid src language: ' + opt.src_lang + '; supported languages: ' + ', '.join(spacy_langs))
    if opt.trg_lang not in spacy_langs:
        print('invalid trg language: ' + opt.trg_lang + '; supported languages: ' + ', '.join(spacy_langs))
    
    print("loading spacy tokenizers...")
    
    t_src = tokenize(opt.src_lang)
    t_trg = tokenize(opt.trg_lang)

    TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

    if opt.load_weights is not None:
        try:
            print("loading presaved fields...")
            SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
            TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
        except:
            print("error opening SRC.pkl and TRG.pkl field files, please ensure they are in " + opt.load_weights + "/")
            quit()
        
    return SRC, TRG
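The create_fields examples here and below (Examples #3 and #9) assume a small tokenize wrapper whose tokenizer method turns a sentence into a list of token strings. The class below is only a minimal sketch of what such a wrapper usually looks like; the spaCy-backed body is an assumption, and only the name tokenize and its tokenizer attribute come from the examples.

import spacy

class tokenize(object):
    """Minimal sketch of the spaCy wrapper assumed by the create_fields examples."""

    def __init__(self, lang):
        # lang may be a shortcut such as 'en' (older spaCy releases) or a full
        # model name such as 'en_core_web_md', as in Example #3 below.
        self.nlp = spacy.load(lang)

    def tokenizer(self, sentence):
        # Return plain token strings so torchtext's Field can call this
        # method directly as its tokenize argument.
        return [tok.text for tok in self.nlp.tokenizer(sentence)]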
Example #2
 def teste_arquivo( self ):
     arquivo = open("input.txt")
     input = arquivo.readline()
     resposta = ["Hoje", "tem", "dojo-nilc", "."]
     self.assertEqual( tokenize( input ), resposta )

     print( tokenize( input ) )

     arquivo.close()
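This teste_ method, and the similar ones in the later examples, read like methods of a unittest.TestCase exercising a Portuguese tokenizer. A minimal harness along the following lines would run them; the class name TestTokenize is an assumption, and the commented import has to be pointed at the module actually under test.

import unittest
# from Tokenize import tokenize  # hypothetical import; the real module name is not shown

class TestTokenize(unittest.TestCase):
    def teste_dois(self):
        entrada = "hoje tem dojo"
        resposta = ["hoje", "tem", "dojo"]
        self.assertEqual(tokenize(entrada), resposta)

if __name__ == '__main__':
    unittest.main()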
Example #3
def create_fields(opt):
    timer = Timer()

    spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
    if opt.src_lang not in spacy_langs:
        print('invalid src language: ' + opt.src_lang +
              '; supported languages: ' + ', '.join(spacy_langs))
    if opt.trg_lang not in spacy_langs:
        print('invalid trg language: ' + opt.trg_lang +
              '; supported languages: ' + ', '.join(spacy_langs))

    lang_compatibility = {
        'en': 'en_core_web_md',
        'pt': 'pt_core_news_md',
        'fr': 'fr_core_news_md',
        'de': 'de_core_news_md',
        'es': 'es_core_news_md'
    }

    # Fall back to the raw language code if no full spaCy model name is mapped.
    src_lang = lang_compatibility.get(opt.src_lang, opt.src_lang)
    trg_lang = lang_compatibility.get(opt.trg_lang, opt.trg_lang)

    print("loading spacy tokenizers...")

    try:
        t_src = tokenize(src_lang)
        t_trg = tokenize(trg_lang)
    except Exception as e:
        print(
            f'Reached exception {e}.\nPlease download the model using: python -m spacy download <model_name>.'
        )
        quit()

    TRG = data.Field(lower=True,
                     tokenize=t_trg.tokenizer,
                     init_token='<sos>',
                     eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

    if opt.load_weights is not None:
        try:
            print("loading presaved fields...")
            SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
            TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
        except:
            traceback.print_exc(file=sys.stdout)
            print(
                "error opening SRC.pkl and TRG.pkl field files, please ensure they are in "
                + opt.load_weights + "/")
            quit()

    timer.print_time('create_fields')

    return SRC, TRG
Example #4
	def read_data(self):

		self.english_data = open('data/english.txt').read()
		self.french_data = open('data/french.txt').read()

		# Create the objects of tokenize
		en_tokenizer = tokenize('en')
		fr_tokenizer = tokenize('fr')

		# Field is used for pre-processing and post-processing
		self.SRC = data.Field(lower=True, tokenize=en_tokenizer.tokenizer)
		self.TRG = data.Field(lower=True, tokenize=fr_tokenizer.tokenizer, init_token='<sos>', eos_token='<eos>')
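read_data stops after creating the fields. With the legacy torchtext API these snippets appear to use, the usual next step is to wrap the parallel lines into a Dataset and build the vocabularies; the helper below is a hedged sketch of that step, with build_dataset as a hypothetical name.

from torchtext.legacy import data

def build_dataset(english_data, french_data, SRC, TRG):
    # Hypothetical continuation of read_data: pair up the parallel lines,
    # wrap them as Examples, and build the source/target vocabularies.
    fields = [('src', SRC), ('trg', TRG)]
    examples = [
        data.Example.fromlist([en, fr], fields)
        for en, fr in zip(english_data.split('\n'), french_data.split('\n'))
    ]
    dataset = data.Dataset(examples, fields)
    SRC.build_vocab(dataset)
    TRG.build_vocab(dataset)
    return dataset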
Example #5
def scraper(url, resp):
    if resp.status >= 400 and resp.status < 600:
        return list()
    if resp.status >= 600 and resp.status <= 608:
        return list()
    if resp.status == 200 and resp.raw_response is None:
        return list()
    soup = BeautifulSoup(resp.raw_response.content, features="lxml")
    links = extract_next_links(url, soup)
    contentText = ""
    for contentGroup in soup.find_all(['p', 'title', re.compile(r"^h[0-9]+$")]):
        for string in contentGroup.stripped_strings:
            contentText += string + ' '
    result = []
    with open("frontier.shelve.urls.txt", "a+", encoding="utf-8") as urlsFile:
        urlsFile.write(str(url) + " " + str(tokenize(contentText)) + '\n')
        for link in links:
            if is_valid(link):
                result.append(link)
            else:
                urlsFile.write(str(link) + " -1\n")

    return result
Example #6
    def executeHelper(self, message):
        """
        Ad hoc variant for the LEFT OUTER JOIN code path that uses it, which
        requires commit deep-copying to be turned off.
        """

        from collections import deque

        message = self.encode(message)

        tokens = tokenize(message)

        tokens = deque(tokens)

        if tokens[0] == "CREATE":
            self.createTable(tokens)

        elif tokens[0] == "INSERT":
            self.insert(tokens)

        elif tokens[0] == "UPDATE":
            return self.update(tokens)

        elif tokens[0] == "DELETE":
            return self.delete(tokens)

        elif message.find("LEFT OUTER JOIN") != -1:
            return self.leftOuterJoin(tokens)

        elif tokens[0] == "SELECT":
            return self.select(tokens)

        # Empty list if no row-returning statement was executed; otherwise the
        # branches above return a list of tuples (rows converted from lists to tuples).
        return []
Example #7
def allCitiesFreq(path):
    cityFileSetIn = open('cityFileSets', 'rb')
    cityFileSet = pickle.load(cityFileSetIn)
    cityFileSetIn.close()
    TotalHotels = 0
    for city in cityFileSet:
        files = cityFileSet[city]
        if len(files)<50 or city == 'noCity':
            continue
        TotalHotels += len(cityFileSet[city])
        
    if not os.path.isfile('./FreqResults/allCitiesFreq'):
        freqDict = dict()
        TotalReviews = 0
        currentHotel = 0
        
        for city in cityFileSet:
            files = cityFileSet[city]
            if len(files)<50 or city == 'noCity':
                continue
            for file in files:
                with open(path + file) as j_hotel:
                    hotel = json.load(j_hotel)
                currentHotel += 1
                stopset = hotelNameAddress(hotel)
                
                for review in hotel.get('Reviews'):
                    TotalReviews += 1
                    content = review.get('Content').encode('ascii', 'ignore').decode('ascii')
                    overall = float(review.get('Ratings').get('Overall'))
                    tokens = tokenize(content, stopset)
                    docLength = len(tokens)
                    reviewCounter = Counter(tokens)
    
                    for word in reviewCounter:
                        wordFreq = float(reviewCounter[word])/float(docLength)
                        if word in freqDict:
                            freqDict[word]['countSum'] += reviewCounter[word]
                            freqDict[word]['freqSum'] += wordFreq
                            freqDict[word]['docCount'] += 1
                            if freqDict[word]['lastHotel'] != file:
                                freqDict[word]['lastHotel'] = file
                                freqDict[word]['hotelCount'] += 1
                            freqDict[word]['ratingSum'] += overall
                        else:
                            freqDict[word] = {'countSum': reviewCounter[word], 'freqSum': wordFreq,
                                              'docCount': 1, 'hotelCount': 1,
                                              'ratingSum': overall, 'lastHotel': file}
                    currentProgress(currentHotel, TotalHotels)
        freqDict['TotalHotels'] = TotalHotels
        freqDict['TotalReviews'] = TotalReviews
        allCityFreqOut = open('./FreqResults/allCitiesFreq', 'wb')
        pickle.dump(freqDict, allCityFreqOut)
        allCityFreqOut.close()
    else:
        allCityFreqIn = open('./FreqResults/allCitiesFreq', 'rb')
        freqDict = pickle.load(allCityFreqIn)
        allCityFreqIn.close()
    
    return freqDict
Example #8
def freq(path):
    if not os.path.isfile('./FreqResults/allFreq'):    
        freqDict = dict()
        currentFile = 0
        totalFiles = len(os.listdir(path))
        TotalHotels = 0
        TotalReviews = 0
        
        for file in os.listdir(path):
            currentFile += 1
            currentProgress(currentFile, totalFiles)
           
            
            with open(path + file) as j_hotel:
                hotel = json.load(j_hotel)
            
            stopset = hotelNameAddress(hotel)
            reviews = hotel.get('Reviews')
        
            if len(reviews)==0:
                continue
            else:
                TotalHotels +=1
            
            for review in reviews:
                
                TotalReviews += 1
                content = review.get('Content').encode('ascii', 'ignore').decode('ascii')
                overall = float(review.get('Ratings').get('Overall'))
                tokens = tokenize(content, stopset)
                docLength = len(tokens)
                reviewCounter = Counter(tokens)
    
                for word in reviewCounter:
                    wordFreq = float(reviewCounter[word])/float(docLength)
                    if word in freqDict:
                        freqDict[word]['countSum'] += wordFreq
                        freqDict[word]['docCount'] += 1
                        freqDict[word]['ratingSum'] += overall
                        if freqDict[word]['lastHotel'] != file:
                            freqDict[word]['lastHotel'] = file
                            freqDict[word]['hotelCount'] += 1
                    else:
                        freqDict[word] = {'countSum': wordFreq, 'docCount': 1, 'hotelCount': 1,
                                          'ratingSum': overall, 'lastHotel': file}
    
        freqDict['TotalHotels'] = TotalHotels
        freqDict['TotalReviews'] = TotalReviews
        allFreqOut = open('./FreqResults/allFreq', 'wb')
        pickle.dump(freqDict, allFreqOut)
        allFreqOut.close()
    else:
        allFreqIn = open('./FreqResults/allFreq', 'rb')
        freqDict = pickle.load(allFreqIn)
        allFreqIn.close()
    
    return freqDict
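freq (like allCitiesFreq in Example #7) stores per-word document and hotel counts alongside the TotalReviews and TotalHotels entries, which is enough to derive the idf/ihf and average-rating figures referenced in the column names of Example #13. Below is a small post-processing sketch; the helper name summarize_word is an assumption, while the key names come from the code above.

import math

def summarize_word(freqDict, word):
    # freqDict layout follows freq()/allCitiesFreq(): one dict per word plus
    # the global 'TotalReviews' and 'TotalHotels' counters.
    stats = freqDict[word]
    idf = math.log(float(freqDict['TotalReviews']) / stats['docCount'])
    ihf = math.log(float(freqDict['TotalHotels']) / stats['hotelCount'])
    avg_rating = stats['ratingSum'] / stats['docCount']
    return {'idf': idf, 'ihf': ihf, 'avgRevScore': avg_rating}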
        
    
Example #9
def create_fields(opt):
    print("loading spacy tokenizers...")

    t_src = tokenize(opt.src_lang)
    t_trg = tokenize(opt.trg_lang)

    TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

    if opt.load_weights is not None:
        try:
            print("loading presaved fields...")
            SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
            TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
        except:
            print("error opening SRC.pkl and TRG.pkl field files, please ensure they are in " + opt.load_weights + "/")
            quit()

    return SRC, TRG
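All three create_fields variants read src_lang, trg_lang and load_weights off an opt object; a minimal way to build one is an argparse namespace with just those flags. The flag names mirror the attributes used above and the defaults are placeholders.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-src_lang', default='en')
parser.add_argument('-trg_lang', default='fr')
parser.add_argument('-load_weights', default=None)
opt = parser.parse_args([])  # empty list: use the defaults instead of sys.argv

SRC, TRG = create_fields(opt)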
Example #10
def preprocess(dataset, algorithm):
    message = ""
    message+=PREPROCESS_BEGIN_MESSAGE+'\n'
    print (PREPROCESS_BEGIN_MESSAGE)
    check, stopWords, loadMessage = loadStopWords("/var/www/html/winifier/server/__server_python__/__preprocess__/stop_words/stanford_core_nlp_stopWords.txt")
    message+=loadMessage+'\n'
    if check:                
        tokenizeCheck, tokenList, tokenizeMessage = tokenize(dataset, algorithm)        
        message+=tokenizeMessage+'\n'
        if tokenizeCheck:
            removalCheck, pureTokens, removeStMessage = removeStopWords(tokenList, stopWords)
            message+=removeStMessage+'\n'
            if removalCheck:
                stemCheck, stemmedTokens, stemMessage = stemming(pureTokens)
                message+=stemMessage +'\n'
                if stemCheck:
                    labelListCheck, labelList, pointsList, labelPtMessage = getLabelsAndPoints(dataset)
                    message+=labelPtMessage+'\n'
                    if labelListCheck:
                        message+=PREPROCESS_SUCCESS_MESSAGE+'\n'
                        print(PREPROCESS_SUCCESS_MESSAGE)
                        with open(getPath()+"/__preprocess__/messages.log","a") as log:
                            log.write(message)
                        return True, stemmedTokens, pointsList, labelList
                    else:
                        message+=PREPROCESS_ERROR_MESSAGE+'\n'
                        print(PREPROCESS_ERROR_MESSAGE)
                        with open(getPath()+"/__preprocess__/messages.log","a") as log:
                            log.write(message)
                        return False, None, None, None
            else:
                message+=PREPROCESS_ERROR_MESSAGE
                print(PREPROCESS_ERROR_MESSAGE)
                with open(getPath()+"/__preprocess__/messages.log","a") as log:
                    log.write(message)
                return False, None, None, None
Example #11
 def teste_Nove( self ):
     input = "A banana custa R$2,50."
     resposta = ["A", "banana", "custa", "R$", "2,50", "."]
     self.assertEqual( tokenize( input ), resposta )
Example #12
    def execute(self, message):
        """
        :param message: sql statement string
        :return: list of tuples of rows. Or blank, if it's a sql statement that doesn't return anything
        """

        from collections import deque

        message = self.encode(message)

        tokens = tokenize(message)

        tokens = deque(tokens)

        ### check if the proper locks are held

        if self.commitMode == 'AUTOCOMMIT':
            # checkout the latest version of the database
            self.checkout()

        returnObject = None

        if tokens[0] == "BEGIN":
            self.begin(tokens)

        elif tokens[0] == "COMMIT":
            self.commit(tokens)

        elif tokens[0] == "CREATE":
            returnObject = self.createTable(tokens)

        elif tokens[0] == "DROP":
            returnObject = self.dropTable(tokens)

        elif tokens[0] == "INSERT":
            self.lock = Connection._ALL_DATABASES[self.filename].requestLock(
                self.name, 'RESERVED')
            returnObject = self.insert(tokens)

        elif tokens[0] == "UPDATE":
            self.lock = Connection._ALL_DATABASES[self.filename].requestLock(
                self.name, 'RESERVED')
            returnObject = self.update(tokens)

        elif tokens[0] == "DELETE":
            self.lock = Connection._ALL_DATABASES[self.filename].requestLock(
                self.name, 'RESERVED')
            returnObject = self.delete(tokens)

        elif message.find("LEFT OUTER JOIN") != -1:
            returnObject = self.leftOuterJoin(tokens)

        elif tokens[0] == "SELECT":
            self.lock = Connection._ALL_DATABASES[self.filename].requestLock(
                self.name, 'SHARED')
            returnObject = self.select(tokens)

        elif tokens[0] == "ROLLBACK":
            returnObject = self.rollback(tokens)

        # if in autocommit mode, then commit by updating the global database
        if self.commitMode == "AUTOCOMMIT":
            if self.lock in ['RESERVED', 'EXCLUSIVE']:
                self.lock = Connection._ALL_DATABASES[
                    self.filename].requestLock(self.name, 'EXCLUSIVE')

            self.publishChanges()

        return returnObject if returnObject is not None else []
Example #13
        stopset = hotelNameAddress(hotel)
        reviews = hotel.get('Reviews')
    
        if len(reviews)==0:
            continue
        
        for review in reviews:
            
            content = review.get('Content').encode('ascii', 'ignore').decode('ascii')
            sentTokens = sent_tokenize(content)
            #strippedTokens = removeStopWords(allTokens, stopset)
            #d = Counter(strippedTokens)
            #count.update(d)
            for s in sentTokens:
                trans += [tokenize(s, stopset)]

    transOut = open('trans.dat', 'wb')
    pickle.dump(trans, transOut)
    transOut.close()
else:
    transIn = open('trans.dat', 'rb')
    trans = pickle.load(transIn)
    transIn.close()
support = ceil(0.01*TotalReviews)/len(trans)*25

feats = apriori(trans, zmin = setmin, zmax= setmax,supp=support, conf = confidence, target='r', report = 'CS')

with open('./FeatResults/allHotels_z'+str(setmax)+'_s'+str(support)[0:5]+'_c'+str(confidence)+'.csv', 'w', newline='') as csvfile:
    foutwriter = csv.writer(csvfile, dialect = 'excel')
    header = ['word1', 'word2', 'word3', 'confidence', 'support', 'W1freq', 'W2freq', 'W3freq', 'W1tfidf', 'W2tfidf', 'W3tfidf', 'W1tfihf', 'W2tfihf', 'W3tfihf','W1idf', 'W2idf', 'W3idf', 'W1ihf', 'W2ihf', 'W3ihf', 'W1avgRevScore', 'W2avgRevScore', 'W3avgRevScore', 'sentiment', 'W1Sent', 'W2Sent', 'W3Sent','W1POS', 'W2POS', 'W3POS']
Example #14
from pprint import pprint
from Tokenize import tokenize
from Analyze import treeBuilder
from Analyze import countTree
from Analyze import printTree
from Diff import diffTree
from Diff import simplifyTree
from Diff import simplifyVar
from Output import latexOutput

print("============================================")
print("\tPyDifferiantiator v. 0.1")
print("============================================")
input_str = input("Enter the expression >>> ")
# Get a list of tokens
token_list = tokenize(input_str)

# Debug - print all tokens with their type and value
# pprint(token_list)

# Get the tree of this expression
_builder = treeBuilder(token_list)
tree_head = _builder.getHead()

# Debug - print the original tree
# print("The original tree")
# print(printTree(tree_head))

diff_head = diffTree(tree_head)

# Not the best solution, but good for small expressions
Example #15
 def teste_dois(self):
     input = "hoje tem dojo"
     resposta = ["hoje", "tem", "dojo"]
     self.assertEqual( tokenize( input ), resposta, "Não passou no teste 2")
Example #16
 def teste_dez( self ):
     input = "Hoje tem dojo-nilc";
     resposta = ["Hoje", "tem", "dojo-nilc"]
     self.assertEqual( tokenize( input ), resposta )
Example #17
	return k, cls

Internal.internals['compFunc'] = compFunc


if __name__ == '__main__':
	import sys, traceback, builtins
	from Preprocess import preprocess
	from Tokenize import tokenize
	from Parse import parse
	
	for parm in sys.argv[1:]:
		setattr(builtins, parm, True)
	
	fin = open(sys.argv[1], 'r')
	tree = parse(tokenize(preprocess(fin.read()), sys.argv[1]), sys.argv[1])
	fin.close()
	
	#print(tree)

	# def disp(x):
		# print(len(traceback.extract_stack()), x)
		# raise Internal.Halt()
	
	#This is dumb.
	newTree = []
	for xpr in tree:
		if isa(xpr, KlipList):
			if len(xpr) and xpr[0] == Sym('include'):
				fin = open(xpr[1], 'r')
				newTree += parse(tokenize(preprocess(fin.read()), xpr[1]), xpr[1])
Example #18
 def teste_Oito( self ):
     input = "hoje tem dojo 11.2."
     resposta = ["hoje", "tem", "dojo", "11.2", "."]
     self.assertEqual( tokenize( input ), resposta )
Example #19
 def teste_Seis( self ):
     input = "hoje tem dojo 1.2"
     resposta = ["hoje", "tem", "dojo", "1.2"]
     self.assertEqual( tokenize( input ), resposta )
Example #20
 def teste_Cinco( self ):
     input = "hoje tem dojo!"
     resposta = ["hoje", "tem", "dojo", "!"]
     self.assertEqual( tokenize( input ), resposta )
Example #21
 def teste_Quatro( self ):
     input = "hoje  tem    dojo"
     resposta = ["hoje", "tem", "dojo"]
     self.assertEqual( tokenize( input ), resposta )
Example #22
 def teste_Tres( self ):
     input = "     "
     resposta = []
     self.assertEqual( tokenize( input ), resposta )
Example #23
 def teste_Dollar( self ):
     input = "A banana custa US$2,50."
     resposta = ["A", "banana", "custa", "US$", "2,50", "."]
     self.assertEqual( tokenize( input ), resposta )
Example #24
            return f

        return Sym(self.cur.value)

    def __call__(self, tokens, fname):
        self.fname = fname
        self.input = tokens
        self.it = iter(tokens)
        self.inc(None)

        temp = KlipList()

        while self.cur:
            temp.append(self.parseTerminal())
            self.inc(None)

        return temp


parse = Parser()

if __name__ == '__main__':
    from Preprocess import preprocess
    from Tokenize import tokenize
    import sys, pprint
    fin = open(sys.argv[1], 'r')
    tree = parse(tokenize(preprocess(fin.read()), sys.argv[1]), sys.argv[1])
    fin.close()

    print(tree)