def main(file, dictionaryname):
    starttime = time.time()
    dictionary = cd.getDictFromDisk(dictionaryname)
    matrix = initializeTfMatrix(dictionary["0"], len(dictionary))
    tk.tokenize(file, getLines, dictionary, matrix)
    end = time.time() - starttime
    print("Time for TF matrix:", end)
    print("Current size of tf matrix:", sys.getsizeof(matrix) / 1000000, "Mbytes")
    print(matrix[0])
def keywords(self, filename, num_topics=5, keywords_per_topic=3):
    text = ""
    with open(filename) as f:
        for line in f:
            text += line
    words = tokenize(text, "word", return_spans=False)
    sentences = tokenize(text, "sentence", return_spans=False)
    wc = {}
    clean_sentences = []
    for sent in sentences:
        clean_sent = {}
        for word in tokenize(sent, "word", return_spans=False):
            word = self.TF.clean(word)
            clean_sent[word] = 1
            wc[word] = wc.get(word, 0) + 1
        clean_sentences.append(clean_sent)
    matrix = []
    for word in wc.keys():
        row = []
        for sent in clean_sentences:
            if word in sent:
                row.append(self.TF.weight(word, wc[word]))
            else:
                row.append(0)
        matrix.append(row)
    matrix = numpy.matrix(matrix)
    U, s, Vh = scipy.linalg.svd(matrix, full_matrices=False)
    D = s * Vh
    keywords = []
    for topic in range(num_topics):
        try:
            words = sorted(enumerate([u for u in U[:, topic]]), key=lambda x: x[1])
        except IndexError:
            print "Problem indexing numpy array for", filename, "on topic", topic
            continue
        added = 0
        word_index = 0
        while added < keywords_per_topic and word_index < len(words):
            #print "Looking at", words[word_index], wc.keys()[words[word_index][0]]
            if wc.keys()[words[word_index][0]] not in keywords:
                keywords.append(wc.keys()[words[word_index][0]])
                added += 1
            word_index += 1
    return ", ".join(keywords)
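# --- Illustrative sketch (not from the project above) ---
# keywords() ranks vocabulary terms by their loading on each left singular vector
# of the term-by-sentence matrix. A minimal, self-contained version of that SVD
# step on invented toy data:
import numpy
import scipy.linalg

vocab = ["cat", "dog", "fish"]
# Rows are terms, columns are sentences; values are term weights.
toy_matrix = numpy.array([
    [1.0, 0.0, 1.0],
    [0.0, 2.0, 0.0],
    [1.0, 1.0, 0.0],
])
U, s, Vh = scipy.linalg.svd(toy_matrix, full_matrices=False)
topic = 0
# Sort terms by their loading on the chosen topic, as keywords() does.
ranked = sorted(enumerate(U[:, topic]), key=lambda x: x[1])
print([vocab[i] for i, _ in ranked])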
def classify(text):
    global po_tot, ne_tot, tr_tot, de_tot
    po_weight = 0
    ne_weight = 0
    tr_weight = 0
    de_weight = 0
    words = text.split()
    if tokenization is True:
        processed_data = tokenize(words)
    else:
        processed_data = words
    for feature in processed_data:
        if feature not in model_features:
            continue
        po_weight += math.log10(float(positive_model[feature]) / float(po_tot))
        ne_weight += math.log10(float(negative_model[feature]) / float(ne_tot))
        tr_weight += math.log10(float(truth_model[feature]) / float(tr_tot))
        de_weight += math.log10(float(decept_model[feature]) / float(de_tot))
    d = {po_weight: "positive", ne_weight: "negative",
         tr_weight: "truthful", de_weight: "deceptive"}
    label_1 = d[max(tr_weight, de_weight)]
    label_2 = d[max(po_weight, ne_weight)]
    return label_1 + " " + label_2
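# --- Illustrative sketch (not the source models; toy counts invented) ---
# classify() sums log10 feature probabilities per label and keeps the label with
# the larger sum. Note that building a {weight: label} dict collapses labels whose
# weights happen to be equal; comparing the sums directly, as below, avoids that.
import math

toy_positive_model = {"great": 8, "bad": 2}
toy_negative_model = {"great": 1, "bad": 9}
toy_po_tot = float(sum(toy_positive_model.values()))
toy_ne_tot = float(sum(toy_negative_model.values()))


def toy_score(tokens, model, total):
    return sum(math.log10(model[t] / total) for t in tokens if t in model)


toy_tokens = "great great bad".split()
po = toy_score(toy_tokens, toy_positive_model, toy_po_tot)
ne = toy_score(toy_tokens, toy_negative_model, toy_ne_tot)
print("positive" if po > ne else "negative")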
def count(text, c, features):
    global tokenization
    words = text.split()
    if tokenization is True:
        processed_data = tokenize(words)
    else:
        processed_data = words
    for feature in processed_data:
        features_count[c] += 1
        features_count["tot"] += 1
        if feature in all_features:
            all_features[feature] += 1
        else:
            all_features[feature] = 1
        if feature in features:
            features[feature] += 1
        else:
            features[feature] = 1
def run_test():
    tokens = tokenization.tokenize("(defun fn (a) (+ a 42))")
    if tokens != ['(', 'defun', 'fn', '(', 'a', ')', '(', '+', 'a', 42, ')', ')']:
        print "Error on line", get_location()[0], "in", get_location()[1]
    tokens = tokenization.tokenize("abc \"this is a literal\" def")
    if tokens != ["abc", "this is a literal", "def"]:
        print "Error on line", get_location()[0], "in", get_location()[1]
    tokens = tokenization.tokenize(" abc 1 53.5 cc(g b)a 'def")
    if tokens != ["abc", 1, 53.5, "cc", "(", "g", "b", ")", "a", "'", "def"]:
        print tokens
        print "Error on line", get_location()[0], "in", get_location()[1]
def generate_haiku_by_word(self):
    """Creates a dict mapping each word to the haikus in which it occurs."""
    haiku_by_word = defaultdict(list)
    word_set = set([
        word
        for haiku in self.haiku_list
        for word in tokenization.tokenize(haiku)[:-1]
    ])
    for word in word_set:
        for haiku in self.haiku_list:
            if word in tokenization.tokenize(haiku):
                haiku_by_word[word].append(haiku)
    with open('haiku_by_word.pickle', 'wb') as f:
        pickle.dump(haiku_by_word, f)
def get_best_match(self, snippet):
    get_near_dups = self.simhash_index.get_near_dups
    generate_simhash = self.generate_simhash
    title_author_to_count = {}
    paras = extract_paragraphs(snippet)
    # Evenly distribute the corrupted paragraphs
    #shuffle(paras)
    # For each paragraph, get the closest matching previously encountered paragraphs.
    # If multiple matches, prune via edit distance.
    # The work of art that matches the most paragraphs is the winner (if it matches enough)
    paras_done = 0
    for para in paras:
        tokens = tokenize(para)
        if not tokens:
            continue
        paras_done += 1
        sh = generate_simhash(tokens)
        candidates = [make_tuple(match) for match in get_near_dups(sh)]
        # Increment the count of these works
        for candidate in candidates:
            _, title, author, para_num = candidate
            k = (title, author)
            title_author_to_count[k] = title_author_to_count.get(k, 0) + 1
    if title_author_to_count:
        # OK, what work was the most frequent, and what was that frequency?
        (title, author), f = max(title_author_to_count.iteritems(),
                                 key=lambda item: item[1])
        score = 1. * f / paras_done
        if score >= 0.1:
            return {'title': title, 'author': author, 'score': score,
                    'author_score': None, 'completion': None}
    # This is either so corrupt that we can't tell what it is, or is a new work.
    # Guess the author
    tokens = [item for sublist in [tokenize(p) for p in paras] for item in sublist]
    author_guess, author_score = self.author_identifier.predict_author(tokens)
    completion = self.author_semantic_models.complete(
        author_guess, tokens, self.num_words_to_complete, 1)
    return {'title': None, 'author': author_guess, 'score': None,
            'author_score': author_score, 'completion': completion}
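# --- Illustrative sketch, assuming the PyPI `simhash` package (Simhash /
# SimhashIndex with get_near_dups); not the project's own index. The keys below,
# stringified (id, title, author, para_num) tuples, are invented to mirror the
# make_tuple() calls in get_best_match().
from simhash import Simhash, SimhashIndex

example_paragraphs = {
    "(1, 'Moby Dick', 'Melville', 1)": "call me ishmael some years ago",
    "(2, 'Walden', 'Thoreau', 1)": "i went to the woods to live deliberately",
}
example_index = SimhashIndex(
    [(key, Simhash(text.split())) for key, text in example_paragraphs.items()], k=3)

query = Simhash("call me ishmael some years back".split())
print(example_index.get_near_dups(query))  # keys of near-duplicate paragraphs, if any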
def test_1_mais_2_menos_3(self):
    self.assertEqual([
        ('N', '1'),
        ('+', '+'),
        ('N', '2'),
        ('-', '-'),
        ('N', '3')
    ], tokenize('1+2-3'))
def test_para_os_operadores_validos(self):
    self.assertEqual([
        ('+', '+'),
        ('-', '-'),
        ('*', '*'),
        ('/', '/'),
        (':', ':'),
        ('(', '('),
        (')', ')'),
    ], tokenize('+-*/:()'))
def filter_and_tokenize(rows, lang):
    """Filters and tokenizes sentence pairs.

    For each row r, r[0] must be the target sentence and r[1] must be the source
    sentence. Returns rows that can be tokenized as sentences, with elements
    trg_tokenized and src_tokenized added at the end of each row.
    """
    # Pre-tokenization filters:
    rows = tuple(r for r in rows if is_line(r[0]) and is_line(r[1]))
    rows = tuple(r for r in rows if '\xad' not in r[0] and '\xad' not in r[1])
    # Tokenization:
    trgtok = tokenization.tokenize((r[0] for r in rows), lang)
    srctok = tokenization.tokenize((r[1] for r in rows), 'eng')
    rows = tuple(r + (t, s) for r, t, s in zip(rows, trgtok, srctok))
    # Post-tokenization filters:
    rows = tuple(r for r in rows if r[-2] and r[-1])
    return rows
def test_parses_single_quoted_string_literal(self):
    self.assertEqual(
        _string_literal_expression_parser(
            0, tokenization.tokenize("'Hello, world'")),
        (
            True,
            1,
            FurStringLiteralExpression(string='Hello, world'),
        ),
    )
def buildInvertedIndex(hotel_data):
    '''Builds the name-to-hotel and address-to-hotel indexes plus average scores.

    The address index covers the whole address: region, street, city, postal-code,
    locality.
    '''
    name_result = {}
    address_result = {}
    score_result = {}
    for hotel_id in range(13000):  # hotel_data:
        hotel_id = str(hotel_id)
        if hotel_id in hotel_data:
            # Name-to-hotel index
            print hotel_id  #, hotel_data[hotel_id]
            l_name = tokenize(hotel_data[hotel_id]["name"])
            for term in l_name:
                if term not in name_result:
                    name_result[term] = []
                if hotel_id not in name_result[term]:
                    name_result[term].append(hotel_id)
            # Address-to-hotel index
            l_address = []
            address_dic = hotel_data[hotel_id]["address"]
            # Handle each sub-address of the address: region, locality, street, postal-code
            for sub_address in address_dic:
                l_address.extend(tokenize(address_dic[sub_address]))
            for term in l_address:
                if term not in address_result:
                    address_result[term] = []
                if hotel_id not in address_result[term]:
                    address_result[term].append(hotel_id)
            # Average review scores
            average_scores = {}
            for review_id in hotel_data[hotel_id]["reviews"]:
                for aspect in hotel_data[hotel_id]["reviews"][review_id]["ratings"]:
                    if aspect not in average_scores:
                        average_scores[aspect] = 0
                    if hotel_data[hotel_id]["reviews"][review_id]["ratings"][aspect] != "":
                        average_scores[aspect] += float(
                            hotel_data[hotel_id]["reviews"][review_id]["ratings"][aspect])
            for average_score in average_scores:
                average_scores[average_score] /= len(hotel_data[hotel_id]["reviews"])
            score_result[hotel_id] = average_scores
    return (name_result, address_result, score_result)
def get_tokenized_line(self):
    line = input("> ").strip()
    # Lowercase a leading capital letter
    if len(line) > 0 and line[0].isupper():
        line = line[0].lower() + line[1:]
    line = list(filter(lambda x: x not in self.stopwords, tokenize(line)))
    line = [
        self.morphosyntactic.get_dictionary().get(token, [])
        for token in line
    ]
    return line
def eval(s):
    tokens = tokenization.tokenize(s)
    if DEBUG:
        print("tokens:", tokens)
    tokens, tree = parse.parse(tokens)
    if DEBUG:
        print("tree:", tree)
    tree = parse.quote(tree)
    if DEBUG:
        print("post quote:", tree)
    return [core.evaluate({}, x) for x in tree]
def main():
    TF = TFIDF()
    text = ""
    with open(sys.argv[1]) as f:
        for line in f:
            text += line
    words = tokenize(text, "word", return_spans=False)
    sentences = tokenize(text, "sentence", return_spans=False)
    wc = {}
    for word in words:
        word = TF.clean(word)
        if word is not None:
            wc[word] = wc.get(word, 0) + 1
    tf_dict = {}
    for k in wc.keys():
        tf_dict[k] = TF.weight(k, wc[k])
    top = sorted(tf_dict.iteritems(), key=operator.itemgetter(1), reverse=True)[:15]
    for (k, v) in top:
        print k, v, TF.weight(k, wc[k], debug=True)
def test_parses_function_with_string_literal_argument(self):
    self.assertEqual(
        _function_call_expression_parser(
            0, tokenization.tokenize("print('Hello, world')")),
        (
            True,
            4,
            FurFunctionCallExpression(
                name='print',
                arguments=(FurStringLiteralExpression(string='Hello, world'),),
            ),
        ),
    )
def cleanText(filename):
    '''Preprocess the text: tokenize, remove stopwords and reduce the words to their stems.

    :param filename: path of the file to be preprocessed
    '''
    global sentence_dictionary, sentences
    readStopWords()
    sentence_dictionary, sentences = tokenize(filename)
    size = 0
    for i in range(0, len(sentence_dictionary)):
        size += len(sentence_dictionary[i])
    return sentence_dictionary, sentences, size
def compute_dialogue_tf(self, text=None):
    if text is not None:
        dialogues = [dialogue.split("\n") for dialogue in text]
    else:
        dialogues = self.get_dialogues_lines()
    self.dialogue_term_frequency = []
    for document_idx, document in enumerate(dialogues):
        for line_idx, line in enumerate(document):
            for term in tokenize(line):
                term = term.lower()
                if term.isalnum():
                    self._increase_dialogue_tf(term, line_idx, document_idx)
    return self.dialogue_term_frequency
def create_reverse_index(path_to_documents_collection, morphosyntactic):
    index = defaultdict(lambda: set())
    with open(path_to_documents_collection, 'r', encoding='utf-8') as file:
        print("+++ creating reverse index +++")
        for line_number, line in enumerate(file):
            if line.startswith("#"):
                continue
            line = tokenize(line.split(":")[-1])
            for token in line:
                base_tokens = morphosyntactic.get_dictionary().get(token, [])
                for base_token in base_tokens:
                    index[base_token].add(line_number)
    print("+++ reverse index created +++")
    print(len(index))
    return [index, []]
def nameToHotel(hotel_data):
    '''Builds the index from hotel name terms to hotel ids.

    The index is built from the name part only.
    '''
    result = {}
    for id in hotel_data:
        # print tokenize(hotel_data[id]["name"])
        l = tokenize(hotel_data[id]["name"])
        for term in l:
            if term not in result:
                result[term] = []
            if id not in result[term]:
                result[term].append(id)
    # print result
    return result
def predict(vectoriser, model, text):
    # Predict the sentiment
    listD = tokenize(str(text).lower())
    textdata = vectoriser.transform(listD)
    sentiment = model.predict(textdata)
    # Make a list of text with sentiment.
    data = []
    for text, pred in zip(text, sentiment):
        data.append((text, pred))
    # Convert the list into a Pandas DataFrame.
    df = pd.DataFrame(data, columns=['text', 'sentiment'])
    df = df.replace([0, 1, 2], ["Negative", "Neutral", "Positive"])
    print(df.sentiment)
    return df
def load_dialogues_from_file(document_path, *, do_tokenization=True, remove_authors=False):
    with open(document_path) as file:
        lines = file.readlines()
    dialogues_list = [line for line in lines if not line.startswith("#")]
    dialogues_list = "".join(dialogues_list)
    dialogues_list = dialogues_list.split("\n\n")
    if remove_authors:
        dialogues_list = [
            "\n".join([line.split(":")[-1] for line in dialogue.split("\n")])
            for dialogue in dialogues_list
        ]
    if do_tokenization:
        dialogues_list = [tokenize(line) for line in dialogues_list]
    return dialogues_list
def add(self, doc, title, author):
    add_to_index = self.simhash_index.add
    # Index each paragraph in the document into the simhash index
    paras = extract_paragraphs(doc)
    # Update the word shape language model for this author
    para_toks = [tokenize(p) for p in paras]
    flat_tokens = [item for sublist in para_toks for item in sublist]
    self.author_semantic_models.add_doc(flat_tokens, author)
    # Update the semantic model for this author
    self.author_identifier.add_doc(flat_tokens, author)
    # Add each paragraph to the simhash index
    for para_num, tokens in enumerate(para_toks, 1):
        if not tokens:
            continue
        sh = self.generate_simhash(tokens)
        self.simhash_index.add((tokens, title, author, para_num), sh)
def emulate():
    """Listens for an incoming POST request with emulation parameters.

    :return:
    """
    # TODO: this
    data = json.loads(request.data)
    number_of_token_bags = tokenize(PD=data.get('PD'),
                                    LGD=data.get('LGD'),
                                    credit_value=data.get('creditSum', 100),
                                    number_of_credits=data.get('creditsCount'))
    with open(settings.LOCK_FILE_NAME, 'w') as lockfile:
        if has_flock(lockfile):
            logger.warning('Could not acquire lock.')
            return Response(status=503)
        global process
        if process is not None:
            process.join()  # to avoid a zombie process
        emulation_uuid = uuid.uuid4()
        redis.set(str(emulation_uuid) + '__token_bags', number_of_token_bags)
        process = Process(target=run_emulation,
                          kwargs=dict(url=settings.API_URL,
                                      emulation_uuid=emulation_uuid,
                                      assets=number_of_token_bags,
                                      meanmoney=data.get('meanmoney', 800),
                                      days=data.get('days'),
                                      yearreturn=data.get('placementRate'),
                                      meantargetreturn=data.get('placementRate'),
                                      nplaysers=data.get('peopleCount', 10)))
        process.start()
    return Response(json.dumps({'result': {'emulation_uuid': str(emulation_uuid)}}),
                    status=200, content_type='application/json')
def run_extractor(doc_set, num_of_topics, num_of_words):
    raw = []
    if type(doc_set) == str:
        doc_set = [doc_set]
    for doc in doc_set:
        # Odd scenario where our document text is given in a single element list
        if type(doc) == list:
            doc = doc[0]
        raw_doc = doc.lower()
        # Convert to tokens
        tokens = tokenize(raw_doc)
        # Remove stop words from tokens
        stopped_tokens = remove_stops(tokens)
        # Stem tokens
        stemmed_tokens = stem_words(stopped_tokens)
        # Add to our list
        raw.append(stemmed_tokens)
    dictionary = corpora.Dictionary(raw)
    # Convert to bag of words:
    # tuples where tuple[0] is the word id and tuple[1] is the word's occurrence count in the document
    corpus = [dictionary.doc2bow(text) for text in raw]
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=num_of_topics,
                                        id2word=dictionary, passes=20)
    return ldamodel.print_topics(num_topics=num_of_topics, num_words=num_of_words)
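# --- Hypothetical invocation of run_extractor() above (documents invented) ---
# Relies on the tokenize/remove_stops/stem_words helpers and gensim imports used
# by the function itself.
example_docs = [
    "The cat sat on the mat and the cat purred.",
    "Dogs chase cats while their owners watch.",
]
for extracted_topic in run_extractor(example_docs, num_of_topics=2, num_of_words=3):
    print(extracted_topic)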
def construct(self):
    corpus = {}
    # Check to see if we should simply load a pickle
    if os.path.isfile(self.pickle_docs):
        with open(self.pickle_docs) as docs_file:
            current_doclist = pickle.load(docs_file)
            if os.listdir('articles/') == current_doclist:
                # The current article list is the same as the pickled article list,
                # so we want to just load the stored pickled corpus data
                with open(self.pickle_corpus) as corpus_file:
                    self.words = pickle.load(corpus_file)
                    self.n = len(current_doclist)
                return
    # If we don't load a pickle, build the corpus from the articles/ dir
    num_docs = 0.0
    for file_name in os.listdir('articles/'):
        num_docs += 1
        doc = {}
        with open("articles/" + file_name) as article:
            for line in article:
                for word in tokenize(line, "word", return_spans=False):
                    word = self.clean(word)
                    doc[word] = 1
        for key in doc.keys():
            corpus[key] = corpus.get(key, 0) + 1
    self.words = corpus
    self.n = num_docs
    print "Pickling a new TFIDF corpus"
    # Pickle corpus and document list
    with open(self.pickle_docs, "w") as docs_file:
        pickle.dump(os.listdir('articles/'), docs_file)
    with open(self.pickle_corpus, "w") as corpus_file:
        pickle.dump(self.words, corpus_file)
def corrupt_book(doc, prob_para, prob_tok, prob_mutate):
    """
    params:
        prob_para   : the probability a paragraph will be chosen to be corrupted
        prob_tok    : the probability a token within a corrupted paragraph will be chosen for corruption
        prob_mutate : the probability the corruption will be a mutation; 1 minus this
                      probability is the probability of deletion
    """
    paras = extract_paragraphs(doc)
    paras = [tokenize(para) for para in paras]
    num_to_corrupt = int(round(len(paras) * prob_para))
    if not num_to_corrupt:
        return doc
    inds_to_corrupt = choice(range(len(paras)), num_to_corrupt)
    for i in inds_to_corrupt:
        tokens = paras[i]
        num_toks_to_corrupt = int(round(len(tokens) * prob_tok))
        if not num_toks_to_corrupt:
            continue
        tok_inds_to_corrupt = choice(range(len(tokens)), num_toks_to_corrupt)
        for j in tok_inds_to_corrupt:
            # Should we mutate the token, or remove it?
            if random() > prob_mutate:
                # Remove it
                tokens[j] = u''
            else:
                # Mutate it
                tokens[j] = u''.join(choice(chars_list, len(tokens[j])))
    # Collapse it all back down into a unicode document
    paras = [u' '.join(para) for para in paras]
    doc = u'\n\n'.join(paras)
    return doc
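# --- Hypothetical invocation of corrupt_book() above (text invented) ---
# Corrupt ~30% of paragraphs, touching ~20% of their tokens; a corrupted token is
# mutated with probability 0.5 and deleted otherwise.
example_book = u"First paragraph of the novel.\n\nSecond paragraph of the novel."
print(corrupt_book(example_book, prob_para=0.3, prob_tok=0.2, prob_mutate=0.5))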
def __init__(self):
    # Get a file to read words from user input
    input_file = input('Please enter a file URL: ')
    SOURCE_TEXT = open(input_file)  # open file
    self.counts = Heap()  # initialize Heap object
    tokens = tokenize(SOURCE_TEXT)  # create tokens from file
    hash_table = HashTable()  # create hash table
    # Insert tokens into hash table
    for i in range(len(tokens)):
        hash_table.insert(tokens[i])
    # Insert key-value pairs into the Heap
    for index in range(hash_table.num_of_buckets):
        if hash_table.buckets[index].is_empty():
            continue
        temp = hash_table.buckets[index].head
        while temp is not None:
            pair = (temp.data[0], temp.data[1])
            self.counts.insert(pair)
            temp = temp.next
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
main_logger = init_logger(log_dir, f'finetune_main_{model_name}.log')

# Import data
test = pd.read_csv(f'{args.data_dir}test.csv')
train = pd.read_csv(f'{args.data_dir}train.csv')

# Min-max scale target after rank transformation
for col in TARGETS:
    train[col] = train[col].rank(method="average")
train[TARGETS] = MinMaxScaler().fit_transform(train[TARGETS])
y = train[TARGETS].values

# Get model inputs
ids_train, seg_ids_train = tokenize(
    train, pretrained_model_str=pretrained_models[model_name])
cat_features_train, _ = get_ohe_categorical_features(train, test, 'category')

# Set training parameters
device = 'cuda'
num_workers = 10
n_folds = 10
lr = 1e-5
n_epochs = 10
bs = 2
grad_accum = 4
weight_decay = 0.01
loss_fn = nn.BCEWithLogitsLoss()

# Start training
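# --- Toy illustration (values invented) of the rank-then-min-max target transform
# used above; pandas' rank and sklearn's MinMaxScaler are the same calls.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

toy = pd.DataFrame({"target": [0.1, 0.9, 0.5, 0.5]})
toy["target"] = toy["target"].rank(method="average")          # -> 1.0, 4.0, 2.5, 2.5
toy[["target"]] = MinMaxScaler().fit_transform(toy[["target"]])
print(toy["target"].tolist())                                  # -> [0.0, 1.0, 0.5, 0.5]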
        grammar_category.Number.SINGULAR: {
            grammar_category.Case.GENITIVE: 'mamuta',
            grammar_category.Case.VOCATIVE: 'mamucie',
            grammar_category.Case.NOMINATIVE: 'mamut',
            grammar_category.Case.ACCUSATIVE: 'mamuta',
            grammar_category.Case.LOCATIVE: 'mamucie',
            grammar_category.Case.DATIVE: 'mamutowi',
            grammar_category.Case.INSTRUMENTAL: 'mamutem'},
        grammar_category.Number.PLURAL: {
            grammar_category.Case.GENITIVE: 'mamutów',
            grammar_category.Case.VOCATIVE: 'mamuty',
            grammar_category.Case.NOMINATIVE: 'mamuty',
            grammar_category.Case.ACCUSATIVE: 'mamuty',
            grammar_category.Case.LOCATIVE: 'mamutach',
            grammar_category.Case.DATIVE: 'mamutom',
            grammar_category.Case.INSTRUMENTAL: 'mamutami'}},
    grammar_category.Gender.MASCULINE_INANIMATE, 1.)]

pasta = tokenization.tokenize(
    "Mój stary to fanatyk wędkarstwa. Pół mieszkania zajebane wędkami najgorsze. Średnio raz w miesiącu ktoś "
    "wdepnie w leżący na ziemi haczyk czy kotwicę i trzeba wyciągać w szpitalu bo mają zadziory na końcu. W "
    "swoim 22 letnim życiu już z 10 razy byłem na takim zabiegu. Tydzień temu poszedłem na jakieś losowe "
    "badania to baba z recepcji jak mnie tylko zobaczyła to kazała buta ściągać xD bo myślała, że "
    "znowu hak w nodze.")
morph = morphosyntactic.Morphosyntactic("polimorfologik-2.1.txt")
morph.create_morphosyntactic_dictionary()
replacer = Replacing(pasta, words, morph)
assert "raz" in replacer.ignored_words
assert "możliwość" in replacer.ignored_words
print("".join(replacer.replace()))
import sys

import conversion
import crossplatform_ir_generation
import desugaring
import c_generation
import normalization
import optimization
import parsing
import tokenization

source_path = sys.argv[1]

with open(source_path, 'r') as f:
    source = f.read()

tokens = tokenization.tokenize(source)
parsed = parsing.parse(tokens)
desugared = desugaring.desugar(parsed)
normalized = normalization.normalize(desugared)
converted = conversion.convert(normalized)
crossplatform_ir = crossplatform_ir_generation.generate(converted)
optimized = optimization.optimize(crossplatform_ir)
outputted = crossplatform_ir_generation.output(optimized)
print(outputted)

generated = c_generation.generate(optimized)

assert source_path.endswith('.fur')
destination_path = source_path + '.c'
def test_2_mais_2_menos_1(self):
    self.assertEqual([2, 2, '+', 1, '-'], parse(tokenize('2+2-1')))

def test_1_mais_1(self):
    self.assertEqual([1, 1, '+'], parse(tokenize('1+1')))
def summarize(self, filename):
    text = ""
    with open(filename) as f:
        for line in f:
            text += line
    words = tokenize(text, "word", return_spans=False)
    sentences = tokenize(text, "sentence", return_spans=False)
    wc = {}
    clean_sentences = []
    for sent in sentences:
        clean_sent = {}
        for word in tokenize(sent, "word", return_spans=False):
            word = self.TF.clean(word)
            clean_sent[word] = 1
            wc[word] = wc.get(word, 0) + 1
        clean_sentences.append(clean_sent)
    matrix = []
    for word in wc.keys():
        row = []
        for sent in clean_sentences:
            if word in sent:
                row.append(self.TF.weight(word, wc[word]))
            else:
                row.append(0)
        matrix.append(row)
    matrix = numpy.matrix(matrix)
    U, s, Vh = scipy.linalg.svd(matrix, full_matrices=False)
    # D holds the topic-by-sentence weights used to pick summary sentences
    D = s * Vh
    num_sentences = 5
    summary_sentence_indices = []
    # Take the strongest unused sentence from each successive topic
    topic = 0
    while len(summary_sentence_indices) < num_sentences:
        sent_weights = D[topic, :]
        top_sents = sorted(enumerate([s for s in sent_weights]), key=lambda x: x[1])
        for sent in top_sents:
            if sent[0] > 0 and sent[0] not in summary_sentence_indices:
                summary_sentence_indices.append(sent[0])
                break
        topic += 1
    summary = ""
    summary_sentence_indices.sort()
    for i in summary_sentence_indices:
        summary += sentences[i] + "\n"
    return summary
def build_dataset(self):
    if os.path.isfile(self.dataset_file):
        with open(self.dataset_file, "rb") as f:
            dataset = cPickle.load(f)
    else:
        dataset = SupervisedDataSet(len(features), 1)
    if os.path.isfile(self.done_articles_file):
        with open(self.done_articles_file, "rb") as f:
            done_articles = cPickle.load(f)
    else:
        done_articles = {}
    value = -1
    decision = "y"
    for file_name in os.listdir(self.articles_dir):
        print "\n\n"
        print "---" * 10
        decision = raw_input("Do another article? [y/n] ")
        if decision[0].lower() != "y":
            break
        with open("articles/" + file_name) as article:
            text = ""
            first = True
            for line in article.readlines()[1:]:
                text += line
            sentences = tokenize(text, "sentence", return_spans=False)
            article_position = done_articles.get(file_name, 0)
            if article_position >= len(sentences):
                continue
            print "Looking at:", file_name, "from position", article_position
            for sentence in sentences[article_position:]:
                extractor = FeatureExtractor(sentence)
                vectors = extractor.get_feature_vectors(features, "sentence")[0]
                print sentence
                value = -1
                while value == -1:
                    rating = raw_input("nothing=OK, space=bad, q=quit: ")
                    if rating == "":
                        value = [0]
                    elif rating[:1].lower() == "q":
                        value = None
                    elif rating[:1] == " ":
                        value = [1]
                # Quit on q
                if value is None:
                    break
                dataset.appendLinked(vectors, value)
                done_articles[file_name] = done_articles.get(file_name, 0) + 1
    with open(self.dataset_file, "wb") as f:
        cPickle.dump(dataset, f)
    with open(self.done_articles_file, "wb") as f:
        cPickle.dump(done_articles, f)
def test_operador_desconhecido(self):
    with self.assertRaises(UnknownTokenException) as e:
        tokenize('1,1')
    self.assertEqual(e.exception.message, 1)
def test_2(self):
    self.assertEqual([('N', '2')], tokenize('2'))
# coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time

import tokenization

txt_file = "raw.txt"
input_txt = tokenization.tokenize(txt_file)
# input_txt = "哈哈哈"

driver = webdriver.Safari()
driver.maximize_window()
driver.get(r'https://wordart.com/create')
wait = WebDriverWait(driver, 10)
time.sleep(1)

# Problem here
wait.until(
    EC.presence_of_element_located(
        (By.XPATH, "//*[@id='root']/div/div[2]/div[1]/div/div[3]")))
fonts = driver.find_element_by_xpath(
    "//*[@id='root']/div/div[2]/div[1]/div/div[3]")
fonts.click()

# wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='root']/div/div[2]/div[1]/div/div[3]/div[2]/div/ul/li[38]/div[1]/img")))
filter = driver.find_element_by_xpath(
                for term in set.intersection(
                    set(self.term_frequency[document_idx].keys()),
                    set(idf.keys()))}
        return self.tf_idf


if __name__ == "__main__":
    text = ["Ktoś:Witajcie, ludzie!",
            "Ktoś:Przybywamy do Was w pokoju i dobrej woli!",
            "Ktoś:Robot(człowiek) nie może skrzywdzić człowieka, ani przez zaniechanie dopuścić, by doznał on (człowiek) krzywdy.",
            "Ktoś:Roboty widziały rzeczy, o których Wam, ludziom, się nie śniło.",
            "Ktoś:Roboty to Twoi plastikowi kumple, z którymi fajnie jest przebywać.",
            "Ktoś:Roboty mają lśniące, metalowe tyłki, których nie należy gryźć.",
            "Ktoś:I mają \nKtoś inny:plan."]
    text = [tokenize(phrase) for phrase in text]
    morph = Morphosyntactic("data/polimorfologik-2.1.txt")
    tfidf = TF_IDF("data/tf_idf_test", morph)
    tfidf.compute(text)
    print("term_frequency", tfidf.term_frequency)
    print("document_frequency", tfidf.document_frequency)
    print("tf_idf", tfidf.tf_idf)
    saved_path = tfidf.save()
    del tfidf
    tf_idf = TF_IDF("data/tf_idf_test", morph).load()
    print(tf_idf)
    tf_idf = TF_IDF("data/drama_quotes_test.txt", morph)
def test_1(self):
    self.assertEqual([('N', '1')], tokenize('1'))

def test_menos(self):
    self.assertEqual([('-', '-')], tokenize('-'))

def test_mais(self):
    self.assertEqual([('+', '+')], tokenize('+'))
dataset = args.job_offer_file
output_file = args.output_file


def main():
    # Read random top 350K tweets
    top_tweets = pq.read_table(dataset).to_pandas()
    l = list(range(0, len(top_tweets)))
    # Select num_tweets rows at random
    random_tweet_set = random.choices(l, k=num_tweets)
    df = pd.DataFrame(top_tweets, index=random_tweet_set)
    csv_file = open(output_file, mode='w')
    fieldnames = ['Tweet_ID', 'Text', 'Token', 'ORG', 'LOC', 'JOB_TITLE', 'Sector']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for id in df.itertuples(index=False):
        index, text = id[0], id[2]
        tweet_tokens = tokenization.tokenize(text)
        for token in tweet_tokens:
            writer.writerow({'Tweet_ID': index, 'Text': text, 'Token': token,
                             'ORG': '', 'LOC': '', 'JOB_TITLE': '', 'Sector': ''})


if __name__ == "__main__":
    main()
class Node:
    def __init__(self, key, did_occure):
        self.node_key = key
        self.node_next = None
        self.node_val = 0
        self.occured = did_occure
        self.future_list = []
        self.count = 0


if __name__ == '__main__':
    token_list = []
    if len(sys.argv) > 1:
        token_list = tokenization.tokenize(sys.argv[1])
    else:
        filename = input("Enter Filename: ")
        token_list = tokenization.tokenize(filename)
    my_graph = Markov_Chain()
    for i, k in enumerate(token_list):
        if my_graph.get(k) is None:
            my_graph.set(k, True)
        else:
            my_graph.get(k).occured = True
            my_graph.get(k).node_val += 1
        if i < len(token_list) - 1:
            my_graph.update(k, token_list[i + 1])
    histo = sample.stochastic(my_graph)
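# --- Self-contained sketch (not the project's Markov_Chain class) ---
# The __main__ loop above records, for every token, which token follows it.
# The same bigram counting with plain dictionaries, on invented tokens:
from collections import defaultdict

example_tokens = "the cat sat on the mat".split()
transitions = defaultdict(lambda: defaultdict(int))
for current_tok, next_tok in zip(example_tokens, example_tokens[1:]):
    transitions[current_tok][next_tok] += 1
print(dict(transitions["the"]))  # {'cat': 1, 'mat': 1}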
def test_11_mais_11(self):
    self.assertEqual([
        ('N', '11'),
        ('+', '+'),
        ('N', '11')
    ], tokenize('11+11'))

def test_2(self):
    self.assertEqual([2], parse(tokenize('2')))

def test_1_espaco_1(self):
    self.assertEqual([
        ('N', '1'),
        ('N', '1')
    ], tokenize('1 1'))

def test_2_mais_2_vezes_1(self):
    self.assertEqual([2, 2, 1, '*', '+'], parse(tokenize('2+2*1')))

def test_1_mais_1_com_espacos(self):
    self.assertEqual([
        ('N', '1'),
        ('+', '+'),
        ('N', '1')
    ], tokenize('1 + 1'))

def test_2_mais_2(self):
    self.assertEqual([2, 2, '+'], parse(tokenize('2+2')))

def test_1(self):
    self.assertEqual([1], parse(tokenize('1')))