Example #1
    def newenvironment(self, name, nargs=0, definition=None, opt=None):
        """ 
        Create a \\newenvironment 

        Required Arguments:
        name -- name of the macro to create
        nargs -- integer number of arguments that the macro has
        definition -- two-element tuple containing the LaTeX definition.
            Each element should be a string.  The first element 
            corresponds to the beginning of the environment, and the
            second element is the end of the environment.
        opt -- string containing the LaTeX code to use in the 
            optional argument

        Examples::
            c.newenvironment('mylist', 0, (r'\\begin{itemize}', r'\\end{itemize}'))

        """
        name = str(name)
        # Macro already exists
        if self.has_key(name):
            if not issubclass(self[name],
                              (plasTeX.NewCommand, plasTeX.Definition)):
                return
            macrolog.debug('redefining environment "%s"', name)

        if nargs is None:
            nargs = 0
        assert isinstance(nargs, int), 'nargs must be an integer'

        if definition is not None:
            assert isinstance(definition, (tuple,list)), \
                'definition must be a list or tuple'
            assert len(definition) == 2, 'definition must have 2 elements'
            # Copy into a list so the begin/end parts can be tokenized in place,
            # even when a tuple is passed (as in the docstring example).
            definition = list(definition)

            if isinstance(definition[0], basestring):
                definition[0] = [x for x in Tokenizer(definition[0], self)]
            if isinstance(definition[1], basestring):
                definition[1] = [x for x in Tokenizer(definition[1], self)]

        if isinstance(opt, basestring):
            opt = [x for x in Tokenizer(opt, self)]

        macrolog.debug('creating newenvironment %s', name)

        # Begin portion
        newclass = new.classobj(name, (plasTeX.NewCommand, ), {
            'nargs': nargs,
            'opt': opt,
            'definition': definition[0]
        })
        self.addGlobal(name, newclass)

        # End portion
        newclass = new.classobj('end' + name, (plasTeX.NewCommand, ), {
            'nargs': 0,
            'opt': None,
            'definition': definition[1]
        })
        self.addGlobal('end' + name, newclass)
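For context, here is a minimal usage sketch of the environment registered above. It assumes plasTeX's TeX front end and its ownerDocument/context wiring, which are not shown in the example, so treat it as an illustration rather than the project's own test code.

# Hypothetical usage sketch (plasTeX TeX front end assumed):
from plasTeX.TeX import TeX

tex = TeX()
tex.ownerDocument.context.newenvironment(
    'mylist', 0, (r'\begin{itemize}', r'\end{itemize}'))
tex.input(r'\begin{mylist}\item first \item second\end{mylist}')
doc = tex.parse()  # \begin{mylist}...\end{mylist} now expands like itemize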
Example #2
def test_issues__semicolon_missing_from_outfile():
    fake_file = BytesIO(b"""
      var char key;  // the key currently pressed by the user
      var boolean exit;
      let exit = false;
    """)

    t = Tokenizer(fake_file)
    assert (t.advance().token == 'var')
    assert (t.advance().token == 'char')
    assert (t.advance().token == 'key')
    assert (t.advance().token == ';')

    assert (t.advance().token == 'var')
    assert (t.advance().token == 'boolean')
    assert (t.advance().token == 'exit')
    assert (t.advance().token == ';')

    assert (t.advance().token == 'let')
    assert (t.advance().token == 'exit')
    assert (t.advance().token == '=')
    assert (t.advance().token == 'false')
    assert (t.advance().token == ';')

    fake_file = BytesIO(b"        return();")
    t = Tokenizer(fake_file)
    assert (t.advance().token == 'return')
    assert (t.advance().token == '(')
    assert (t.advance().token == ')')
    assert (t.advance().token == ';')
Example #3
    def runTokenizer(self):
        """Prepare the Files for the tokenizer"""

        inFileSource = re.sub(
            r"\.txt$", "",
            self.fInput) + "-" + self.pDict["sourceLanguage"] + ".txt"
        tokFileSource = re.sub(r"\.txt$", "-tok.txt", inFileSource)

        inFileTarget = re.sub(
            r"\.txt$", "",
            self.fInput) + "-" + self.pDict["targetLanguage"] + ".txt"
        tokFileTarget = re.sub(r"\.txt$", "-tok.txt", inFileTarget)

        self.populate(inFileSource, inFileTarget)

        tokenizer = Tokenizer(inFileSource, tokFileSource,
                              self.pDict["sourceLanguage"])
        tokenizer.execute()

        tokenizer = Tokenizer(inFileTarget, tokFileTarget,
                              self.pDict["targetLanguage"])
        tokenizer.execute()

        self.pDict["tokFileSource"] = tokFileSource
        self.pDict["tokFileTarget"] = tokFileTarget

        #-----Clean the intermediate Files------
        os.remove(inFileSource)
        os.remove(inFileTarget)
Example #4
    def build_index(self):
        '''
        Build the inverted index: insert the URL into the doc table with a
        doc_id, insert each token into the tokenT table, and insert the token,
        doc_id, term frequency, and weight into the web_index table.
        '''

        c = Corpus()
        t = Tokenizer()

        for url, name in c.get_file_name():
            if len(url) > 1000:
                continue
            result = t.tokenize(name)
            if len(result) == 0:
                continue
            print(url)
            doc_id = 1

            #Insert URL to table DOC
            sql = "INSERT INTO web.doc(url) values (%s)"
            val = (url, )
            self.mycursor.execute(sql, val)
            self.mydb.commit()

            print(self.mycursor.rowcount, "was inserted in URL.")

            print(url)
            s_sql = "select id from doc where url=%s"
            self.mycursor.execute(s_sql, val)
            myresult = self.mycursor.fetchone()
            doc_id = myresult[0]
            print("DOC_ID IS " + str(doc_id))

            #Insert token, doc_id, tf into web_index
            t_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"

            t_val = []
            for token in result.keys():
                t_val.append(
                    (token, doc_id, result[token][0], result[token][1]))

            #print(t_val)

            self.mycursor.executemany(t_sql, t_val)

            self.mydb.commit()
            print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")

            #insert into TokenT table
            count = 0
            for token in result.keys():
                tq = "Insert ignore into tokenT values (%s)"
                tv = (token, )
                self.mycursor.execute(tq, tv)
                self.mydb.commit()
                count += 1

            print("inserted " + str(count) + " Tokens")
Example #5
    def calculate(self, target_paragraph, predicted_paragraph):

        tokenizer = Tokenizer()

        target_words_list = tokenizer.tokenize_to_words_nltk(
            target_paragraph.encode('utf-8'))
        predicted_words_list = tokenizer.tokenize_to_words_nltk(
            predicted_paragraph.encode('utf-8'))

        dictionary = Utils.construct_dictionary(
            [target_words_list, predicted_words_list])
        target_bow, predicted_bow = Utils.retrieve_bag_of_words(
            dictionary, target_words_list, predicted_words_list)
        document_precision, document_recall, document_f1score = Utils.calculate_bow_precision_recall(
            target_bow, predicted_bow, len(target_words_list),
            len(predicted_words_list))
        self.precision_list.append(document_precision)
        key = int(np.floor_divide(document_precision, 10))
        self.precision_count_map[key] = self.precision_count_map.get(key,
                                                                     0) + 1

        self.recall_list.append(document_recall)
        key = int(np.floor_divide(document_recall, 10))
        self.recall_count_map[key] = self.recall_count_map.get(key, 0) + 1

        self.f1score_list.append(document_f1score)
        key = int(np.floor_divide(document_f1score, 10))
        self.f1score_count_map[key] = self.f1score_count_map.get(key, 0) + 1
Example #6
def get_tokens(jack_file):
    token_generator = Tokenizer(jack_file)
    tokens = []
    for token in token_generator:
        if not token:
            continue
        tokens.append(token)
    return tokens
Example #7
def run(code, file, path):
    Parser.tokens = Tokenizer(code)
    Parser.file = file
    Parser.writer = MKDocsWriter()
    Parser.path = path
    ret = Parser.parseDocstring()
    return ret
Example #8
    def parse_state_machine(cls, inputString):
        tokens = Tokenizer(inputString)
        number_of_states = 0
        state_machine_id = 0
        state_machine = StateMachine()

        while (tokens.has_more_tokens()):
            current_token = tokens.peek()

            if (current_token == '--fsm-config'):
                tokens.next_token()
                state_machine_id = tokens.next_token()
            elif (current_token == '--nstates'):
                tokens.next_token()
                number_of_states = tokens.next_token()
            else:
                new_state = State(tokens)
                state_machine.add_state(new_state)

                # Create transitions
                token = tokens.peek()
                while (re.match(r'--n(\d+)x(\d+)', token) != None
                       and re.match(r'--s(\d+)', token) == None):
                    new_transition = Transition(tokens)
                    state_machine.add_transition(new_transition)
                    token = tokens.peek()

        return state_machine
Example #9
    def start(self, queue):
        tokenizer = Tokenizer()
        tokenized_text = tokenizer.tokenize_paragraph(self._text)
        sentences = [sent for sent, _ in tokenized_text]
        tokenized_text = [' '.join(sent) for _, sent in tokenized_text]

        vectorizer = self._get_vectorizer()
        dtm = vectorizer.fit_transform(tokenized_text)

        u, sigma, vT = randomized_svd(dtm.T,
                                      n_components=self._n_components,
                                      n_iter=5,
                                      random_state=None)
        sigma = np.diag(sigma)
        scores = self._sentence_selection(vT, sigma)
        scores.sort()
        summary = ' '.join([sentences[i] for i in scores])
        similarity = tokenizer.calculate_similarity(self._text, summary)
        res = {
            "id": self._id,
            "summary": summary,
            "similarity": similarity,
            'status': "done"
        }
        if self._title is not None:
            res['title'] = self._title
        queue.put(res)
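For reference, the randomized_svd call above is the truncated SVD that LSA-style summarizers rely on. In standard notation (not taken from this project), with A = dtm.T the term-by-sentence matrix and k = n_components:

    A \approx U_k \,\Sigma_k\, V_k^{\top}, \qquad \Sigma_k = \operatorname{diag}(\sigma_1, \ldots, \sigma_k)

Each column of V_k^{\top} holds one sentence's weights over the k latent topics, which is what _sentence_selection consumes together with sigma to rank sentences for the summary.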
Example #10
File: Indexer.py  Project: manthan787/IR
    def index(self, keep_stopwords=True, stem=False):
        print "Indexing %s documents in bulk!" % (len(self.docs))

        inverted_list = {}
        for dID, text in self.docs.iteritems():
            tokenizer = Tokenizer(text)
            tokens = tokenizer.tokenize()
            position = 1
            filtered_tokens = []
            for t in tokens:
                if not keep_stopwords:
                    if t in stop_words:
                        continue

                if stem:
                    t = porter2_stem(t)

                filtered_tokens.append(t)
                mDocID = self.mapper.mapDocID(dID)
                mToken = self.mapper.mapToken(t)
                if mToken not in inverted_list:
                    inverted_list[mToken] = {}

                inverted_list[mToken].setdefault(mDocID, []).append(position)
                position += 1

            self.docLengths[dID] = len(filtered_tokens)

        return self.__writeInvertedIndex(inverted_list)
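For clarity, the in-memory structure index() builds before handing it to __writeInvertedIndex maps a mapped token id to the mapped documents it appears in, with 1-based positions. Illustrative values only:

# Illustrative shape of inverted_list (values are made up):
inverted_list = {
    17: {            # mToken for some term
        3: [1, 8],   # the term occurs at positions 1 and 8 in mapped doc 3
        5: [2],      # and at position 2 in mapped doc 5
    },
}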
Example #11
def getDocuments2(path):
    # Should find a single file now
    file = glob.glob(path)
    file = open(file[0], "r", encoding="utf8")
    file = file.read()
    file = file.splitlines()
    tokenizer = Tokenizer()

    index = 0
    output = []
    normalizedDoc = []
    for line in file:
        if line == "":
            index += 1
        line = tokenizer.stemQuery(line)
        normalizedDoc += line
        if index % 2 == 0 and index > 1:  # The error is here index thing
            # Instead of returning, we want to normalize this all this and send it somewhere
            # Gonna do lots of computing though
            # Remove the URL from the normalized document

            # If we are at the end of the file normalizedDoc will be empty, so we can just stop
            if normalizedDoc == []:
                return output

            normalizedDoc.pop(0)
            normalizedDoc.pop(0)

            # Add the normalized doc to the output list
            output.append(normalizedDoc)
            # Reset the normalizedDoc list to prepare for the next document
            normalizedDoc = []
Example #12
    def get_comm_token(self, count_num):
        tf = pd.DataFrame()
        for name in self.names_of_knn:
            tf = tf.append(
                self.knn_csv.loc[self.knn_csv['image_name'] == name])

        tf.index = range(len(tf))
        token_list = Tokenizer(tf)

        set_list = []
        total_list = []
        comm_token = []

        for t_list in token_list:
            set_list.append(list(set(t_list)))

        for s_list in set_list:
            total_list += s_list

        for token in total_list:
            count = 0
            for s_list in set_list:
                count += s_list.count(token)

            if count >= count_num:
                comm_token.append(token)

        return list(set(comm_token))
Example #13
    def __init__(self, text, product_name):
        self.candidate_features = []
        self.feature_sentences = []
        self.product_name = product_name.lower().split('-')[0].split('_')
        t = Tokenizer()
        sents = t.sent_tokenize(text.lower())
        p = POSTagger()
        wnl = WordNetLemmatizer()
        for sent in sents:
            tagged_sent = p.nltk_tag(t.word_tokenize(sent))
            feature_sent = {}
            feature_sent['sentence'] = sent
            feature_sent['tags'] = tagged_sent
            feature_sent['nouns'] = []
            feature_sent['noun_phrases'] = []
            for i in range(0, len(tagged_sent)):
                (word, tag) = tagged_sent[i]
                #Don't include proper nouns
                if tag.startswith('N') and tag != 'NNP':
                    """
					Consecutive nouns might form a feature phrase. Eg. Picture quality is a phrase.
					Meaningless phrases like 'quality digital' are removed later as their frequeny of occurence is	low. """
                    if i > 0 and len(
                            feature_sent['nouns']
                    ) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][
                            -1] and feature_sent['sentence'].find(
                                feature_sent['nouns'][-1] + ' ' + word) > -1:
                        feature_sent['noun_phrases'].append(
                            wnl.lemmatize(feature_sent['nouns'].pop() + ' ' +
                                          word))
                    else:
                        feature_sent['nouns'].append(wnl.lemmatize(word))

            self.feature_sentences.append(feature_sent)
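To make the consecutive-noun heuristic concrete, here is a hypothetical input/output pair (illustrative only, not from the project's data):

# For the sentence "the picture quality is great", nltk-style tagging yields
# [('the', 'DT'), ('picture', 'NN'), ('quality', 'NN'), ('is', 'VBZ'), ('great', 'JJ')],
# so 'picture' is first appended to feature_sent['nouns'] and then merged with
# 'quality' into a single noun phrase:
expected_feature_sent_fragment = {
    'nouns': [],
    'noun_phrases': ['picture quality'],
}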
Example #14
    def __init__(self, args, logger=None):
        self.args = args
        self.logger = logger
        self.tokenizer_method = args.tokenizer_method
        self.remove_stopwords = True
        self.vocab_path = os.path.join(args.experiment_folder, "preprocessed/")
        self.tokenizer = None
        self.seed = args.seed
        self.tokenizer_obj = Tokenizer(language='english',
                                       tokenizer_method=self.tokenizer_method,
                                       remove_stopwords=self.remove_stopwords,
                                       ngram_range=(1, 1),
                                       min_freq=1,
                                       max_freq_perc=1.0)
        self.tokenizer = self.tokenizer_obj.tokenizer
        self.vectorizer = TfidfVectorizer(sublinear_tf=True,
                                          min_df=5,
                                          max_df=0.9,
                                          norm='l2',
                                          ngram_range=(1, 2),
                                          analyzer='word',
                                          tokenizer=identity_fn,
                                          preprocessor=identity_fn,
                                          token_pattern=None)
        self.model = LogisticRegression(random_state=self.seed,
                                        max_iter=int(1e6))
        self.finetune = self.train_pseudo
Example #15
def test_base():
    fake_file = BytesIO(b"method void dispose();")
    t = Tokenizer(fake_file)

    token = t.advance()
    assert (token.token == 'method')
    assert (token.type == 'keyword')

    token = t.advance()
    assert (token.token == 'void')
    assert (token.type == 'keyword')

    token = t.advance()
    assert (token.token == 'dispose')
    assert (token.type == 'identifier')

    token = t.advance()
    assert (token.token == '(')
    assert (token.type == 'symbol')

    token = t.advance()
    assert (token.token == ')')
    assert (token.type == 'symbol')

    token = t.advance()
    assert (token.token == ';')
    assert (token.type == 'symbol')
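These tests only rely on each advance() call returning an object that exposes .token and .type; a minimal stand-in for that interface (hypothetical, the project's real token class is not shown):

from collections import namedtuple

# Hypothetical stand-in for the objects the tests above consume.
Token = namedtuple('Token', ['token', 'type'])
Token(token='method', type='keyword')  # what the first advance() is expected to yield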
Example #16
    def check_corpus(self, scrape_from_when, min_word_freq):
        """
        Checks if there are pre-existing files or if they will have to be regenerated. If data needs to be scraped
        the bot will go ahead and do that and immediately generate a corpus for the collected data.
        :type min_word_freq: the minimum number of times a word must appear in the corpus to be in the user's vocab
        :param scrape_from_when: When the bot will start grabbing tweets from
        """
        if min_word_freq < 1:
            raise ValueError(colors.red("Word frequency threshold must be greater than 0"))
        if self.handle in "test":
            return None  # nothing to do here

        scraped = False
        if not os.path.exists(self.corpus):  # check for corpus file
            print colors.red("no corpus.json file found - generating...")
            if not os.path.exists(self.folder):  # check if they even have a folder yet
                os.mkdir(self.folder)
            scrape(self.handle, self.keys, start=scrape_from_when if scrape_from_when else self.get_join_date())
            scraped = True
        if scrape_from_when and not scraped:  # they already had a corpus and need a special scrape
            scrape(self.handle, self.keys, start=scrape_from_when)

        tokenizer = Tokenizer(min_word_freq)
        tokenizer.generate(self.handle)
        return tokenizer  # always return the Tokenizer object
Example #17
def compile_jack(jack_file_name):
    token_file_name = jack_file_name.replace('.jack', 'T.xml')
    token_file = open(token_file_name, 'w')
    jack_file = open(jack_file_name, 'r')
    tokenizer = Tokenizer(jack_file, token_file)
    vm_file = open(jack_file_name.replace('.jack', '') + '.vm', 'w')
    code_writer = CodeWriter(tokenizer, vm_file)
    code_writer.compile_class()
Example #18
def main():
    tk = Tokenizer('Mytestfor10.jack')
    while tk.has_more_tokens():
        tk.advance()
        print(tk.token_type(), tk.identifier())
    ce = CompilationEngine('Mytestfor10.jack')
    root = Element('do')
    ce.compile_if(root)
Example #19
    def __init__(self, model_dir):
        '''
        @param model_dir: The directory containing all trained model files
        '''
        self.models = {}
        self.tokenizer = Tokenizer()
        os.path.walk(model_dir, install_all_model, self.models)
        print "All models loaded"
Example #20
    def __init__(self, input_content, file=True, debug=False):
        self.tokenizer = Tokenizer(input_content, file)
        self.tokenizer.tokenize()
        self.token = None
        self.is_debug = debug
        self.pos = 0
        self.info = []
        self.debug = []
        self.eps = False
Example #21
    def read(self):
        # print('!', sys.argv[0])
        # print(os.path.dirname(__file__) + '/data/ptb.valid.txt')
        file = open(os.path.dirname(__file__) + '/data/ptb.valid.txt')
        lines = file.readlines()
        tokenizer = Tokenizer(9999, oov_token=1)
        tokenizer.fit_on_texts(lines)
        self.seqs = tokenizer.texts_to_sequences(lines)
        return self.seqs
Example #22
def main(unused_argv):
  if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly
    raise Exception("Problem with flags: %s" % unused_argv)

  tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want
  tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

  # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
  FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
  if not os.path.exists(FLAGS.log_root):
    if FLAGS.mode=="train":
      os.makedirs(FLAGS.log_root)
    else:
      raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root))
  print(FLAGS.vocab_size)
  vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary

  # If in decode mode, set batch_size = beam_size
  # Reason: in decode mode, we decode one example at a time.
  # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
  if FLAGS.mode == 'decode':
    FLAGS.batch_size = FLAGS.beam_size

  # If single_pass=True, check we're in decode mode
  if FLAGS.single_pass and FLAGS.mode!='decode':
    raise Exception("The single_pass flag should only be True in decode mode")

  # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
  hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen']
  hps_dict = {}
  for key,val in FLAGS.__flags.items(): # for each flag
    if key in hparam_list: # if it's in the list
      hps_dict[key] = val # add it to the dict
  hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

  # Create a batcher object that will create minibatches of data
  batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass)

  tf.set_random_seed(111) # a seed value for randomness

  if hps.mode == 'train':
    print("creating model...")
    model = SummarizationModel(hps, vocab)
    setup_training(model, batcher)
  elif hps.mode == 'eval':
    model = SummarizationModel(hps, vocab)
    run_eval(model, batcher, vocab)
  elif hps.mode == 'decode':
    tokenizer = Tokenizer(FLAGS.article_path, FLAGS.data_path)
    decode_model_hps = hps  # This will be the hyperparameters for the decoder model
    decode_model_hps = hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
    model = SummarizationModel(decode_model_hps, vocab)
    decoder = BeamSearchDecoder(model, batcher, vocab)
    decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
  else:
    raise ValueError("The 'mode' flag must be one of train/eval/decode")
Example #23
def test_compile_class_with_subroutine_description():
    fake_file = BytesIO(b"""
    class X{
        constructor X new(){
            return this;
        }
    
        method void new(){
            return this;
        }
    
        function void main() {
            return;
        }
    }""")
    t = Tokenizer(fake_file)
    c = CompilationEngine(t)
    node = c.compile()

    sd_1 = node.value[3].value
    sd_2 = node.value[4].value
    sd_3 = node.value[5].value

    # Test constructor
    assert (sd_1.value[0].name == 'keyword')
    assert (sd_1.value[0].value == 'constructor')

    assert (sd_1.value[1].name == 'identifier')
    assert (sd_1.value[1].value == 'X')

    assert (sd_1.value[2].name == 'identifier')
    assert (sd_1.value[2].value == 'new')

    assert (sd_1.value[3].name == 'symbol')
    assert (sd_1.value[3].value == '(')

    assert (sd_1.value[4].name == 'parameterList')
    assert (sd_1.value[4].value == [])

    assert (sd_1.value[5].name == 'symbol')
    assert (sd_1.value[5].value == ')')

    assert (sd_1.value[6].name == 'subroutineBody')
    assert (sd_1.value[6].value[0].value == '{')

    # Test method
    assert (sd_2.value[0].name == 'keyword')
    assert (sd_2.value[0].value == 'method')

    # Test function
    assert (sd_3.value[0].name == 'keyword')
    assert (sd_3.value[0].value == 'function')

    # Test that class closes properly
    assert (node.value[6].name == 'symbol')
    assert (node.value[6].value == '}')
Example #24
    def __init__(self, vocab_dict, estimated_sent_len, estimated_doc_len):
        self.vocab_dict = vocab_dict
        self.estimated_sent_len = estimated_sent_len
        self.estimated_doc_len = estimated_doc_len
        self.tokenizer = Tokenizer()

        self.sentence_processing = SentenceProcessing()
        self.document_processing = DocumentProcessing()
        self.unknown_words_processing = UnknownWordsProcessing(
            vocab_list=vocab_dict.keys(), replace=False)
Example #25
    def newcommand(self, name, nargs=0, definition=None, opt=None):
        """ 
        Create a \\newcommand 

        Required Arguments:
        name -- name of the macro to create
        nargs -- integer number of arguments that the macro has
        definition -- string containing the LaTeX definition
        opt -- string containing the LaTeX code to use in the 
            optional argument

        Examples::
            c.newcommand('bold', 1, r'\\textbf{#1}')
            c.newcommand('foo', 2, r'{\\bf #1#2}', opt='myprefix')

        """
        name = str(name)
        # Macro already exists
        if self.has_key(name):
            if not issubclass(self[name],
                              (plasTeX.NewCommand, plasTeX.Definition)):
                if not issubclass(self[name], plasTeX.TheCounter):
                    return
            macrolog.debug('redefining command "%s"', name)

        if nargs is None:
            nargs = 0
        assert isinstance(nargs, int), 'nargs must be an integer'

        if isinstance(definition, basestring):
            definition = [x for x in Tokenizer(definition, self)]

        if isinstance(opt, basestring):
            opt = [x for x in Tokenizer(opt, self)]

        macrolog.debug('creating newcommand %s', name)
        newclass = new.classobj(name, (plasTeX.NewCommand, ), {
            'nargs': nargs,
            'opt': opt,
            'definition': definition
        })

        self.addGlobal(name, newclass)
Example #26
    def run(code):

        code = Pre_process.filter(code)
        Parser.tokens = Tokenizer(code)
        resultado = Parser.parseProgram()

        if Parser.tokens.actual.value != 'eof':
            raise Exception("EOF not reached")

        return resultado
Example #27
    def run(self):
        # process and compile each class
        for file in self.fileLists:
            filename = file.stem
            fileHandle = open(str(file))
            self.tokenizer = Tokenizer()
            self.parser = Parser(self.parent, filename)
            self.compileCode(fileHandle)

        print('Code successfully compiled')
Example #28
class Miner():
    filePath = './data/'
    tokenizer = Tokenizer('portuguese')
    writter = TokenWritter()

    def __init__(self, filename):
        if (filename == ''):
            raise ValueError('filename cannot be empty.')
        self.filePath += filename
        self.load()

    def load(self):
        self.writeDataframe(self.generateDataframe())

    def search(self, tags=[], words=[]):
        if (tags == [] and words == []):
            raise ValueError('You must search for a tag or a word, or both.')

        df = next(self.readDataframe())

        if (tags == []):
            return df[df['PALAVRA'].isin(words)]

        if (words == []):
            return df[df['TAG'].isin(tags)]

        wdf = df[df['PALAVRA'].isin(words)]
        return wdf[wdf['TAG'].isin(tags)]

    def generateDataframe(self):
        self.generateMidFile()
        df = pd.read_csv('./minerMid.csv', sep=',', header=0)
        df['count'] = df.groupby(['PALAVRA',
                                  'TAG'])['PALAVRA'].transform('count')
        df.drop_duplicates(inplace=True)
        ''' QUICKSORT ALGORITHM '''
        df.sort_values(by=['count'],
                       ascending=False,
                       inplace=True,
                       kind='quicksort')
        return df

    def writeDataframe(self, df):
        fileDf = open('dataframe.pickle', 'wb')
        pickle.dump(df, fileDf)
        fileDf.close()

    def readDataframe(self):
        file = open('dataframe.pickle', 'rb')
        yield pickle.load(file)
        file.close()

    def generateMidFile(self):
        self.writter.outputTokens(
            self.tokenizer.tokenizeFileWords(self.filePath), 'minerMid.csv')
Example #29
def main():
    expression = input("Enter the expression: ")
    expression += " "
    calc = Calculator()
    toke = Tokenizer()
    print("Expression: " + expression)
    print("-----------")
    calc.tokens = toke.Tokenize(expression)
    calc.PrettyPrint(calc.tokens)
    print("-----------")
    print("Expression Result: ",calc.ArithmeticExpression())
Example #30
def repl():
    tokenizer = Tokenizer()
    parser = Parser()
    exp = ''

    while exp != 'end':
        exp = input('exp > ')
        tokenizer.clear()
        tokens = tokenizer.tokenize(exp)
        codes = parser.parse(tokens)
        show_codes(codes)