def main(file, dictionaryname):
    starttime = time.time()
    dictionary = cd.getDictFromDisk(dictionaryname)
    matrix = initializeTfMatrix(dictionary["0"], len(dictionary))
    tk.tokenize(file, getLines, dictionary, matrix)
    end = time.time() - starttime
    print("Time for TF matrix:", end)
    print("Current size of tf matrix:", sys.getsizeof(matrix) / 1000000, "Mbytes")
    print(matrix[0])
def keywords(self, filename, num_topics=5, keywords_per_topic=3):
    text = ""
    with open(filename) as f:
        for line in f:
            text += line
    words = tokenize(text, "word", return_spans=False)
    sentences = tokenize(text, "sentence", return_spans=False)
    wc = {}
    clean_sentences = []
    for sent in sentences:
        clean_sent = {}
        for word in tokenize(sent, "word", return_spans=False):
            word = self.TF.clean(word)
            clean_sent[word] = 1
            wc[word] = wc.get(word, 0) + 1
        clean_sentences.append(clean_sent)
    matrix = []
    for word in wc.keys():
        row = []
        for sent in clean_sentences:
            if word in sent:
                row.append(self.TF.weight(word, wc[word]))
            else:
                row.append(0)
        matrix.append(row)
    matrix = numpy.matrix(matrix)
    U, s, Vh = scipy.linalg.svd(matrix, full_matrices=False)
    D = s * Vh
    keywords = []
    for topic in range(num_topics):
        try:
            words = sorted(enumerate([u for u in U[:, topic]]), key=lambda x: x[1])
        except IndexError:
            print "Problem indexing numpy array for", filename, "on topic", topic
            continue
        added = 0
        word_index = 0
        while added < keywords_per_topic and word_index < len(words):
            #print "Looking at", words[word_index], wc.keys()[words[word_index][0]]
            if wc.keys()[words[word_index][0]] not in keywords:
                keywords.append(wc.keys()[words[word_index][0]])
                added += 1
            word_index += 1
    return ", ".join(keywords)
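# --- Illustrative sketch (not from the project above) ---
# keywords() ranks vocabulary terms by their loading on each left singular vector
# of the term-by-sentence matrix. A minimal, self-contained version of that SVD
# step on invented toy data:
import numpy
import scipy.linalg

vocab = ["cat", "dog", "fish"]
# Rows are terms, columns are sentences; values are term weights.
toy_matrix = numpy.array([
    [1.0, 0.0, 1.0],
    [0.0, 2.0, 0.0],
    [1.0, 1.0, 0.0],
])
U, s, Vh = scipy.linalg.svd(toy_matrix, full_matrices=False)
topic = 0
# Sort terms by their loading on the chosen topic, as keywords() does.
ranked = sorted(enumerate(U[:, topic]), key=lambda x: x[1])
print([vocab[i] for i, _ in ranked])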
def classify(text):
    global po_tot, ne_tot, tr_tot, de_tot
    po_weight = 0
    ne_weight = 0
    tr_weight = 0
    de_weight = 0
    words = text.split()
    if tokenization is True:
        processed_data = tokenize(words)
    else:
        processed_data = words
    for feature in processed_data:
        if feature not in model_features:
            continue
        po_weight += math.log10(float(positive_model[feature]) / float(po_tot))
        ne_weight += math.log10(float(negative_model[feature]) / float(ne_tot))
        tr_weight += math.log10(float(truth_model[feature]) / float(tr_tot))
        de_weight += math.log10(float(decept_model[feature]) / float(de_tot))
    d = {po_weight: "positive", ne_weight: "negative",
         tr_weight: "truthful", de_weight: "deceptive"}
    label_1 = d[max(tr_weight, de_weight)]
    label_2 = d[max(po_weight, ne_weight)]
    return label_1 + " " + label_2
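# --- Illustrative sketch (not the source models; toy counts invented) ---
# classify() sums log10 feature probabilities per label and keeps the label with
# the larger sum. Note that building a {weight: label} dict collapses labels whose
# weights happen to be equal; comparing the sums directly, as below, avoids that.
import math

toy_positive_model = {"great": 8, "bad": 2}
toy_negative_model = {"great": 1, "bad": 9}
toy_po_tot = float(sum(toy_positive_model.values()))
toy_ne_tot = float(sum(toy_negative_model.values()))


def toy_score(tokens, model, total):
    return sum(math.log10(model[t] / total) for t in tokens if t in model)


toy_tokens = "great great bad".split()
po = toy_score(toy_tokens, toy_positive_model, toy_po_tot)
ne = toy_score(toy_tokens, toy_negative_model, toy_ne_tot)
print("positive" if po > ne else "negative")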
def count(text, c, features):
    global tokenization
    words = text.split()
    if tokenization is True:
        processed_data = tokenize(words)
    else:
        processed_data = words
    for feature in processed_data:
        features_count[c] += 1
        features_count["tot"] += 1
        if feature in all_features:
            all_features[feature] += 1
        else:
            all_features[feature] = 1
        if feature in features:
            features[feature] += 1
        else:
            features[feature] = 1
def run_test():
    tokens = tokenization.tokenize("(defun fn (a) (+ a 42))")
    if tokens != ['(', 'defun', 'fn', '(', 'a', ')', '(', '+', 'a', 42, ')', ')']:
        print "Error on line", get_location()[0], "in", get_location()[1]
    tokens = tokenization.tokenize("abc \"this is a literal\" def")
    if tokens != ["abc", "this is a literal", "def"]:
        print "Error on line", get_location()[0], "in", get_location()[1]
    tokens = tokenization.tokenize(" abc 1 53.5 cc(g b)a 'def")
    if tokens != ["abc", 1, 53.5, "cc", "(", "g", "b", ")", "a", "'", "def"]:
        print tokens
        print "Error on line", get_location()[0], "in", get_location()[1]
def generate_haiku_by_word(self):
    """Creates a dict mapping each word to the haikus in which it occurs."""
    haiku_by_word = defaultdict(list)
    word_set = set([
        word
        for haiku in self.haiku_list
        for word in tokenization.tokenize(haiku)[:-1]
    ])
    for word in word_set:
        for haiku in self.haiku_list:
            if word in tokenization.tokenize(haiku):
                haiku_by_word[word].append(haiku)
    with open('haiku_by_word.pickle', 'wb') as f:
        pickle.dump(haiku_by_word, f)
def get_best_match(self, snippet):
    get_near_dups = self.simhash_index.get_near_dups
    generate_simhash = self.generate_simhash
    title_author_to_count = {}
    paras = extract_paragraphs(snippet)
    # Evenly distribute the corrupted paragraphs
    #shuffle(paras)
    # For each paragraph, get the closest matching previously encountered paragraphs.
    # If multiple matches, prune via edit distance.
    # The work of art that matches the most paragraphs is the winner (if it matches enough)
    paras_done = 0
    for para in paras:
        tokens = tokenize(para)
        if not tokens:
            continue
        paras_done += 1
        sh = generate_simhash(tokens)
        candidates = [make_tuple(match) for match in get_near_dups(sh)]
        # Increment the count of these works
        for candidate in candidates:
            _, title, author, para_num = candidate
            k = (title, author)
            title_author_to_count[k] = title_author_to_count.get(k, 0) + 1
    if title_author_to_count:
        # OK, what work was the most frequent, and what was that frequency?
        (title, author), f = max(title_author_to_count.iteritems(),
                                 key=lambda item: item[1])
        score = 1. * f / paras_done
        if score >= 0.1:
            return {'title': title, 'author': author, 'score': score,
                    'author_score': None, 'completion': None}
    # This is either so corrupt that we can't tell what it is, or is a new work.
    # Guess the author
    tokens = [item for sublist in [tokenize(p) for p in paras] for item in sublist]
    author_guess, author_score = self.author_identifier.predict_author(tokens)
    completion = self.author_semantic_models.complete(
        author_guess, tokens, self.num_words_to_complete, 1)
    return {'title': None, 'author': author_guess, 'score': None,
            'author_score': author_score, 'completion': completion}
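# --- Illustrative sketch, assuming the PyPI `simhash` package (Simhash /
# SimhashIndex with get_near_dups); not the project's own index. The keys below,
# stringified (id, title, author, para_num) tuples, are invented to mirror the
# make_tuple() calls in get_best_match().
from simhash import Simhash, SimhashIndex

example_paragraphs = {
    "(1, 'Moby Dick', 'Melville', 1)": "call me ishmael some years ago",
    "(2, 'Walden', 'Thoreau', 1)": "i went to the woods to live deliberately",
}
example_index = SimhashIndex(
    [(key, Simhash(text.split())) for key, text in example_paragraphs.items()], k=3)

query = Simhash("call me ishmael some years back".split())
print(example_index.get_near_dups(query))  # keys of near-duplicate paragraphs, if any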
def test_1_mais_2_menos_3(self):
    self.assertEqual([
        ('N', '1'),
        ('+', '+'),
        ('N', '2'),
        ('-', '-'),
        ('N', '3')
    ], tokenize('1+2-3'))
def test_para_os_operadores_validos(self):
    self.assertEqual([
        ('+', '+'),
        ('-', '-'),
        ('*', '*'),
        ('/', '/'),
        (':', ':'),
        ('(', '('),
        (')', ')'),
    ], tokenize('+-*/:()'))
def filter_and_tokenize(rows, lang):
    """Filters and tokenizes sentence pairs.

    For each row r, r[0] must be the target sentence and r[1] must be the source
    sentence. Returns rows that can be tokenized as sentences, with elements
    trg_tokenized and src_tokenized added at the end of each row.
    """
    # Pre-tokenization filters:
    rows = tuple(r for r in rows if is_line(r[0]) and is_line(r[1]))
    rows = tuple(r for r in rows if '\xad' not in r[0] and '\xad' not in r[1])
    # Tokenization:
    trgtok = tokenization.tokenize((r[0] for r in rows), lang)
    srctok = tokenization.tokenize((r[1] for r in rows), 'eng')
    rows = tuple(r + (t, s) for r, t, s in zip(rows, trgtok, srctok))
    # Post-tokenization filters:
    rows = tuple(r for r in rows if r[-2] and r[-1])
    return rows
def test_parses_single_quoted_string_literal(self):
    self.assertEqual(
        _string_literal_expression_parser(
            0, tokenization.tokenize("'Hello, world'")),
        (
            True,
            1,
            FurStringLiteralExpression(string='Hello, world'),
        ),
    )
def buildInvertedIndex(hotel_data):
    '''Builds the name-to-hotel and address-to-hotel indexes plus average scores.

    The address index covers the whole address: region, street, city, postal-code,
    locality.
    '''
    name_result = {}
    address_result = {}
    score_result = {}
    for hotel_id in range(13000):  # hotel_data:
        hotel_id = str(hotel_id)
        if hotel_id in hotel_data:
            # Name-to-hotel index
            print hotel_id  #, hotel_data[hotel_id]
            l_name = tokenize(hotel_data[hotel_id]["name"])
            for term in l_name:
                if term not in name_result:
                    name_result[term] = []
                if hotel_id not in name_result[term]:
                    name_result[term].append(hotel_id)
            # Address-to-hotel index
            l_address = []
            address_dic = hotel_data[hotel_id]["address"]
            # Handle each sub-address of the address: region, locality, street, postal-code
            for sub_address in address_dic:
                l_address.extend(tokenize(address_dic[sub_address]))
            for term in l_address:
                if term not in address_result:
                    address_result[term] = []
                if hotel_id not in address_result[term]:
                    address_result[term].append(hotel_id)
            # Average review scores
            average_scores = {}
            for review_id in hotel_data[hotel_id]["reviews"]:
                for aspect in hotel_data[hotel_id]["reviews"][review_id]["ratings"]:
                    if aspect not in average_scores:
                        average_scores[aspect] = 0
                    if hotel_data[hotel_id]["reviews"][review_id]["ratings"][aspect] != "":
                        average_scores[aspect] += float(
                            hotel_data[hotel_id]["reviews"][review_id]["ratings"][aspect])
            for average_score in average_scores:
                average_scores[average_score] /= len(hotel_data[hotel_id]["reviews"])
            score_result[hotel_id] = average_scores
    return (name_result, address_result, score_result)
def get_tokenized_line(self):
    line = input("> ").strip()
    # Lowercase a leading capital letter
    if len(line) > 0 and line[0].isupper():
        line = line[0].lower() + line[1:]
    line = list(filter(lambda x: x not in self.stopwords, tokenize(line)))
    line = [
        self.morphosyntactic.get_dictionary().get(token, [])
        for token in line
    ]
    return line
def eval(s):
    tokens = tokenization.tokenize(s)
    if DEBUG:
        print("tokens:", tokens)
    tokens, tree = parse.parse(tokens)
    if DEBUG:
        print("tree:", tree)
    tree = parse.quote(tree)
    if DEBUG:
        print("post quote:", tree)
    return [core.evaluate({}, x) for x in tree]
def main():
    TF = TFIDF()
    text = ""
    with open(sys.argv[1]) as f:
        for line in f:
            text += line
    words = tokenize(text, "word", return_spans=False)
    sentences = tokenize(text, "sentence", return_spans=False)
    wc = {}
    for word in words:
        word = TF.clean(word)
        if word is not None:
            wc[word] = wc.get(word, 0) + 1
    tf_dict = {}
    for k in wc.keys():
        tf_dict[k] = TF.weight(k, wc[k])
    top = sorted(tf_dict.iteritems(), key=operator.itemgetter(1), reverse=True)[:15]
    for (k, v) in top:
        print k, v, TF.weight(k, wc[k], debug=True)
def test_parses_function_with_string_literal_argument(self):
    self.assertEqual(
        _function_call_expression_parser(
            0, tokenization.tokenize("print('Hello, world')")),
        (
            True,
            4,
            FurFunctionCallExpression(
                name='print',
                arguments=(FurStringLiteralExpression(string='Hello, world'),),
            ),
        ),
    )
def cleanText(filename):
    '''Preprocess the text: tokenize, remove stopwords and reduce the words to their stems.

    :param filename: path of the file to be preprocessed
    '''
    global sentence_dictionary, sentences
    readStopWords()
    sentence_dictionary, sentences = tokenize(filename)
    size = 0
    for i in range(0, len(sentence_dictionary)):
        size += len(sentence_dictionary[i])
    return sentence_dictionary, sentences, size
def compute_dialogue_tf(self, text=None):
    if text is not None:
        dialogues = [dialogue.split("\n") for dialogue in text]
    else:
        dialogues = self.get_dialogues_lines()
    self.dialogue_term_frequency = []
    for document_idx, document in enumerate(dialogues):
        for line_idx, line in enumerate(document):
            for term in tokenize(line):
                term = term.lower()
                if term.isalnum():
                    self._increase_dialogue_tf(term, line_idx, document_idx)
    return self.dialogue_term_frequency
def create_reverse_index(path_to_documents_collection, morphosyntactic):
    index = defaultdict(lambda: set())
    with open(path_to_documents_collection, 'r', encoding='utf-8') as file:
        print("+++ creating reverse index +++")
        for line_number, line in enumerate(file):
            if line.startswith("#"):
                continue
            line = tokenize(line.split(":")[-1])
            for token in line:
                base_tokens = morphosyntactic.get_dictionary().get(token, [])
                for base_token in base_tokens:
                    index[base_token].add(line_number)
    print("+++ reverse index created +++")
    print(len(index))
    return [index, []]
def nameToHotel(hotel_data):
    '''Builds the index from hotel name terms to hotel ids.

    The index is built from the name part only.
    '''
    result = {}
    for id in hotel_data:
        # print tokenize(hotel_data[id]["name"])
        l = tokenize(hotel_data[id]["name"])
        for term in l:
            if term not in result:
                result[term] = []
            if id not in result[term]:
                result[term].append(id)
    # print result
    return result
def predict(vectoriser, model, text):
    # Predict the sentiment
    listD = tokenize(str(text).lower())
    textdata = vectoriser.transform(listD)
    sentiment = model.predict(textdata)
    # Make a list of text with sentiment.
    data = []
    for text, pred in zip(text, sentiment):
        data.append((text, pred))
    # Convert the list into a Pandas DataFrame.
    df = pd.DataFrame(data, columns=['text', 'sentiment'])
    df = df.replace([0, 1, 2], ["Negative", "Neutral", "Positive"])
    print(df.sentiment)
    return df
def load_dialogues_from_file(document_path, *, do_tokenization=True, remove_authors=False):
    with open(document_path) as file:
        lines = file.readlines()
    dialogues_list = [line for line in lines if not line.startswith("#")]
    dialogues_list = "".join(dialogues_list)
    dialogues_list = dialogues_list.split("\n\n")
    if remove_authors:
        dialogues_list = [
            "\n".join([line.split(":")[-1] for line in dialogue.split("\n")])
            for dialogue in dialogues_list
        ]
    if do_tokenization:
        dialogues_list = [tokenize(line) for line in dialogues_list]
    return dialogues_list
def add(self, doc, title, author):
    add_to_index = self.simhash_index.add
    # Index each paragraph in the document into the simhash index
    paras = extract_paragraphs(doc)
    # Update the word shape language model for this author
    para_toks = [tokenize(p) for p in paras]
    flat_tokens = [item for sublist in para_toks for item in sublist]
    self.author_semantic_models.add_doc(flat_tokens, author)
    # Update the semantic model for this author
    self.author_identifier.add_doc(flat_tokens, author)
    # Add each paragraph to the simhash index
    for para_num, tokens in enumerate(para_toks, 1):
        if not tokens:
            continue
        sh = self.generate_simhash(tokens)
        self.simhash_index.add((tokens, title, author, para_num), sh)
def emulate():
    """Listens for an incoming POST request with emulation parameters.

    :return:
    """
    # TODO: this
    data = json.loads(request.data)
    number_of_token_bags = tokenize(PD=data.get('PD'),
                                    LGD=data.get('LGD'),
                                    credit_value=data.get('creditSum', 100),
                                    number_of_credits=data.get('creditsCount'))
    with open(settings.LOCK_FILE_NAME, 'w') as lockfile:
        if has_flock(lockfile):
            logger.warning('Could not acquire lock.')
            return Response(status=503)
        global process
        if process is not None:
            process.join()  # to avoid a zombie process
        emulation_uuid = uuid.uuid4()
        redis.set(str(emulation_uuid) + '__token_bags', number_of_token_bags)
        process = Process(target=run_emulation,
                          kwargs=dict(url=settings.API_URL,
                                      emulation_uuid=emulation_uuid,
                                      assets=number_of_token_bags,
                                      meanmoney=data.get('meanmoney', 800),
                                      days=data.get('days'),
                                      yearreturn=data.get('placementRate'),
                                      meantargetreturn=data.get('placementRate'),
                                      nplaysers=data.get('peopleCount', 10)))
        process.start()
    return Response(json.dumps({'result': {'emulation_uuid': str(emulation_uuid)}}),
                    status=200, content_type='application/json')
def run_extractor(doc_set, num_of_topics, num_of_words):
    raw = []
    if type(doc_set) == str:
        doc_set = [doc_set]
    for doc in doc_set:
        # Odd scenario where our document text is given in a single element list
        if type(doc) == list:
            doc = doc[0]
        raw_doc = doc.lower()
        # Convert to tokens
        tokens = tokenize(raw_doc)
        # Remove stop words from tokens
        stopped_tokens = remove_stops(tokens)
        # Stem tokens
        stemmed_tokens = stem_words(stopped_tokens)
        # Add to our list
        raw.append(stemmed_tokens)
    dictionary = corpora.Dictionary(raw)
    # Convert to bag of words:
    # tuples where tuple[0] is the word id and tuple[1] is the word's occurrence count in the document
    corpus = [dictionary.doc2bow(text) for text in raw]
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=num_of_topics,
                                        id2word=dictionary, passes=20)
    return ldamodel.print_topics(num_topics=num_of_topics, num_words=num_of_words)
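# --- Hypothetical invocation of run_extractor() above (documents invented) ---
# Relies on the tokenize/remove_stops/stem_words helpers and gensim imports used
# by the function itself.
example_docs = [
    "The cat sat on the mat and the cat purred.",
    "Dogs chase cats while their owners watch.",
]
for extracted_topic in run_extractor(example_docs, num_of_topics=2, num_of_words=3):
    print(extracted_topic)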
def construct(self):
    corpus = {}
    # Check to see if we should simply load a pickle
    if os.path.isfile(self.pickle_docs):
        with open(self.pickle_docs) as docs_file:
            current_doclist = pickle.load(docs_file)
            if os.listdir('articles/') == current_doclist:
                # The current article list is the same as the pickled article list,
                # so we want to just load the stored pickled corpus data
                with open(self.pickle_corpus) as corpus_file:
                    self.words = pickle.load(corpus_file)
                    self.n = len(current_doclist)
                return
    # If we don't load a pickle, build the corpus from the articles/ dir
    num_docs = 0.0
    for file_name in os.listdir('articles/'):
        num_docs += 1
        doc = {}
        with open("articles/" + file_name) as article:
            for line in article:
                for word in tokenize(line, "word", return_spans=False):
                    word = self.clean(word)
                    doc[word] = 1
        for key in doc.keys():
            corpus[key] = corpus.get(key, 0) + 1
    self.words = corpus
    self.n = num_docs
    print "Pickling a new TFIDF corpus"
    # Pickle corpus and document list
    with open(self.pickle_docs, "w") as docs_file:
        pickle.dump(os.listdir('articles/'), docs_file)
    with open(self.pickle_corpus, "w") as corpus_file:
        pickle.dump(self.words, corpus_file)
def corrupt_book(doc, prob_para, prob_tok, prob_mutate):
    """
    params:
        prob_para   : the probability a paragraph will be chosen to be corrupted
        prob_tok    : the probability a token within a corrupted paragraph will be chosen for corruption
        prob_mutate : the probability the corruption will be a mutation; 1 minus this
                      probability is the probability of deletion
    """
    paras = extract_paragraphs(doc)
    paras = [tokenize(para) for para in paras]
    num_to_corrupt = int(round(len(paras) * prob_para))
    if not num_to_corrupt:
        return doc
    inds_to_corrupt = choice(range(len(paras)), num_to_corrupt)
    for i in inds_to_corrupt:
        tokens = paras[i]
        num_toks_to_corrupt = int(round(len(tokens) * prob_tok))
        if not num_toks_to_corrupt:
            continue
        tok_inds_to_corrupt = choice(range(len(tokens)), num_toks_to_corrupt)
        for j in tok_inds_to_corrupt:
            # Should we mutate the token, or remove it?
            if random() > prob_mutate:
                # Remove it
                tokens[j] = u''
            else:
                # Mutate it
                tokens[j] = u''.join(choice(chars_list, len(tokens[j])))
    # Collapse it all back down into a unicode document
    paras = [u' '.join(para) for para in paras]
    doc = u'\n\n'.join(paras)
    return doc
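# --- Hypothetical invocation of corrupt_book() above (text invented) ---
# Corrupt ~30% of paragraphs, touching ~20% of their tokens; a corrupted token is
# mutated with probability 0.5 and deleted otherwise.
example_book = u"First paragraph of the novel.\n\nSecond paragraph of the novel."
print(corrupt_book(example_book, prob_para=0.3, prob_tok=0.2, prob_mutate=0.5))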
def __init__(self):
    # Get a file to read words from user input
    input_file = input('Please enter a file URL: ')
    SOURCE_TEXT = open(input_file)  # open file
    self.counts = Heap()  # initialize Heap object
    tokens = tokenize(SOURCE_TEXT)  # create tokens from file
    hash_table = HashTable()  # create hash table
    # Insert tokens into hash table
    for i in range(len(tokens)):
        hash_table.insert(tokens[i])
    # Insert key-value pairs into the Heap
    for index in range(hash_table.num_of_buckets):
        if hash_table.buckets[index].is_empty():
            continue
        temp = hash_table.buckets[index].head
        while temp is not None:
            pair = (temp.data[0], temp.data[1])
            self.counts.insert(pair)
            temp = temp.next
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
main_logger = init_logger(log_dir, f'finetune_main_{model_name}.log')

# Import data
test = pd.read_csv(f'{args.data_dir}test.csv')
train = pd.read_csv(f'{args.data_dir}train.csv')

# Min-max scale target after rank transformation
for col in TARGETS:
    train[col] = train[col].rank(method="average")
train[TARGETS] = MinMaxScaler().fit_transform(train[TARGETS])
y = train[TARGETS].values

# Get model inputs
ids_train, seg_ids_train = tokenize(
    train, pretrained_model_str=pretrained_models[model_name])
cat_features_train, _ = get_ohe_categorical_features(train, test, 'category')

# Set training parameters
device = 'cuda'
num_workers = 10
n_folds = 10
lr = 1e-5
n_epochs = 10
bs = 2
grad_accum = 4
weight_decay = 0.01
loss_fn = nn.BCEWithLogitsLoss()

# Start training
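# --- Toy illustration (values invented) of the rank-then-min-max target transform
# used above; pandas' rank and sklearn's MinMaxScaler are the same calls.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

toy = pd.DataFrame({"target": [0.1, 0.9, 0.5, 0.5]})
toy["target"] = toy["target"].rank(method="average")          # -> 1.0, 4.0, 2.5, 2.5
toy[["target"]] = MinMaxScaler().fit_transform(toy[["target"]])
print(toy["target"].tolist())                                  # -> [0.0, 1.0, 0.5, 0.5]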
        grammar_category.Number.SINGULAR: {
            grammar_category.Case.GENITIVE: 'mamuta',
            grammar_category.Case.VOCATIVE: 'mamucie',
            grammar_category.Case.NOMINATIVE: 'mamut',
            grammar_category.Case.ACCUSATIVE: 'mamuta',
            grammar_category.Case.LOCATIVE: 'mamucie',
            grammar_category.Case.DATIVE: 'mamutowi',
            grammar_category.Case.INSTRUMENTAL: 'mamutem'},
        grammar_category.Number.PLURAL: {
            grammar_category.Case.GENITIVE: 'mamutów',
            grammar_category.Case.VOCATIVE: 'mamuty',
            grammar_category.Case.NOMINATIVE: 'mamuty',
            grammar_category.Case.ACCUSATIVE: 'mamuty',
            grammar_category.Case.LOCATIVE: 'mamutach',
            grammar_category.Case.DATIVE: 'mamutom',
            grammar_category.Case.INSTRUMENTAL: 'mamutami'}},
    grammar_category.Gender.MASCULINE_INANIMATE, 1.)]

pasta = tokenization.tokenize(
    "Mój stary to fanatyk wędkarstwa. Pół mieszkania zajebane wędkami najgorsze. Średnio raz w miesiącu ktoś "
    "wdepnie w leżący na ziemi haczyk czy kotwicę i trzeba wyciągać w szpitalu bo mają zadziory na końcu. W "
    "swoim 22 letnim życiu już z 10 razy byłem na takim zabiegu. Tydzień temu poszedłem na jakieś losowe "
    "badania to baba z recepcji jak mnie tylko zobaczyła to kazała buta ściągać xD bo myślała, że "
    "znowu hak w nodze.")
morph = morphosyntactic.Morphosyntactic("polimorfologik-2.1.txt")
morph.create_morphosyntactic_dictionary()
replacer = Replacing(pasta, words, morph)
assert "raz" in replacer.ignored_words
assert "możliwość" in replacer.ignored_words
print("".join(replacer.replace()))
import sys

import conversion
import crossplatform_ir_generation
import desugaring
import c_generation
import normalization
import optimization
import parsing
import tokenization

source_path = sys.argv[1]

with open(source_path, 'r') as f:
    source = f.read()

tokens = tokenization.tokenize(source)
parsed = parsing.parse(tokens)
desugared = desugaring.desugar(parsed)
normalized = normalization.normalize(desugared)
converted = conversion.convert(normalized)
crossplatform_ir = crossplatform_ir_generation.generate(converted)
optimized = optimization.optimize(crossplatform_ir)
outputted = crossplatform_ir_generation.output(optimized)
print(outputted)

generated = c_generation.generate(optimized)

assert source_path.endswith('.fur')
destination_path = source_path + '.c'
def test_2_mais_2_menos_1(self):
    self.assertEqual([2, 2, '+', 1, '-'], parse(tokenize('2+2-1')))

def test_1_mais_1(self):
    self.assertEqual([1, 1, '+'], parse(tokenize('1+1')))
def summarize(self, filename):
    text = ""
    with open(filename) as f:
        for line in f:
            text += line
    words = tokenize(text, "word", return_spans=False)
    sentences = tokenize(text, "sentence", return_spans=False)
    wc = {}
    clean_sentences = []
    for sent in sentences:
        clean_sent = {}
        for word in tokenize(sent, "word", return_spans=False):
            word = self.TF.clean(word)
            clean_sent[word] = 1
            wc[word] = wc.get(word, 0) + 1
        clean_sentences.append(clean_sent)
    matrix = []
    for word in wc.keys():
        row = []
        for sent in clean_sentences:
            if word in sent:
                row.append(self.TF.weight(word, wc[word]))
            else:
                row.append(0)
        matrix.append(row)
    matrix = numpy.matrix(matrix)
    U, s, Vh = scipy.linalg.svd(matrix, full_matrices=False)
    # D holds the topic-by-sentence weights used to pick summary sentences
    D = s * Vh
    num_sentences = 5
    summary_sentence_indices = []
    # Take the strongest unused sentence from each successive topic
    topic = 0
    while len(summary_sentence_indices) < num_sentences:
        sent_weights = D[topic, :]
        top_sents = sorted(enumerate([s for s in sent_weights]), key=lambda x: x[1])
        for sent in top_sents:
            if sent[0] > 0 and sent[0] not in summary_sentence_indices:
                summary_sentence_indices.append(sent[0])
                break
        topic += 1
    summary = ""
    summary_sentence_indices.sort()
    for i in summary_sentence_indices:
        summary += sentences[i] + "\n"
    return summary
def build_dataset(self):
    if os.path.isfile(self.dataset_file):
        with open(self.dataset_file, "rb") as f:
            dataset = cPickle.load(f)
    else:
        dataset = SupervisedDataSet(len(features), 1)
    if os.path.isfile(self.done_articles_file):
        with open(self.done_articles_file, "rb") as f:
            done_articles = cPickle.load(f)
    else:
        done_articles = {}
    value = -1
    decision = "y"
    for file_name in os.listdir(self.articles_dir):
        print "\n\n"
        print "---" * 10
        decision = raw_input("Do another article? [y/n] ")
        if decision[0].lower() != "y":
            break
        with open("articles/" + file_name) as article:
            text = ""
            first = True
            for line in article.readlines()[1:]:
                text += line
            sentences = tokenize(text, "sentence", return_spans=False)
            article_position = done_articles.get(file_name, 0)
            if article_position >= len(sentences):
                continue
            print "Looking at:", file_name, "from position", article_position
            for sentence in sentences[article_position:]:
                extractor = FeatureExtractor(sentence)
                vectors = extractor.get_feature_vectors(features, "sentence")[0]
                print sentence
                value = -1
                while value == -1:
                    rating = raw_input("nothing=OK, space=bad, q=quit: ")
                    if rating == "":
                        value = [0]
                    elif rating[:1].lower() == "q":
                        value = None
                    elif rating[:1] == " ":
                        value = [1]
                # Quit on q
                if value is None:
                    break
                dataset.appendLinked(vectors, value)
                done_articles[file_name] = done_articles.get(file_name, 0) + 1
    with open(self.dataset_file, "wb") as f:
        cPickle.dump(dataset, f)
    with open(self.done_articles_file, "wb") as f:
        cPickle.dump(done_articles, f)
def test_operador_desconhecido(self):
    with self.assertRaises(UnknownTokenException) as e:
        tokenize('1,1')
    self.assertEqual(e.exception.message, 1)
def test_2(self):
    self.assertEqual([('N', '2')], tokenize('2'))
# coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time

import tokenization

txt_file = "raw.txt"
input_txt = tokenization.tokenize(txt_file)
# input_txt = "哈哈哈"

driver = webdriver.Safari()
driver.maximize_window()
driver.get(r'https://wordart.com/create')
wait = WebDriverWait(driver, 10)
time.sleep(1)

# Problem here
wait.until(
    EC.presence_of_element_located(
        (By.XPATH, "//*[@id='root']/div/div[2]/div[1]/div/div[3]")))
fonts = driver.find_element_by_xpath(
    "//*[@id='root']/div/div[2]/div[1]/div/div[3]")
fonts.click()

# wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='root']/div/div[2]/div[1]/div/div[3]/div[2]/div/ul/li[38]/div[1]/img")))
filter = driver.find_element_by_xpath(
                for term in set.intersection(
                    set(self.term_frequency[document_idx].keys()),
                    set(idf.keys()))}
        return self.tf_idf


if __name__ == "__main__":
    text = ["Ktoś:Witajcie, ludzie!",
            "Ktoś:Przybywamy do Was w pokoju i dobrej woli!",
            "Ktoś:Robot(człowiek) nie może skrzywdzić człowieka, ani przez zaniechanie dopuścić, by doznał on (człowiek) krzywdy.",
            "Ktoś:Roboty widziały rzeczy, o których Wam, ludziom, się nie śniło.",
            "Ktoś:Roboty to Twoi plastikowi kumple, z którymi fajnie jest przebywać.",
            "Ktoś:Roboty mają lśniące, metalowe tyłki, których nie należy gryźć.",
            "Ktoś:I mają \nKtoś inny:plan."]
    text = [tokenize(phrase) for phrase in text]
    morph = Morphosyntactic("data/polimorfologik-2.1.txt")
    tfidf = TF_IDF("data/tf_idf_test", morph)
    tfidf.compute(text)
    print("term_frequency", tfidf.term_frequency)
    print("document_frequency", tfidf.document_frequency)
    print("tf_idf", tfidf.tf_idf)
    saved_path = tfidf.save()
    del tfidf
    tf_idf = TF_IDF("data/tf_idf_test", morph).load()
    print(tf_idf)
    tf_idf = TF_IDF("data/drama_quotes_test.txt", morph)
def test_1(self):
    self.assertEqual([('N', '1')], tokenize('1'))

def test_menos(self):
    self.assertEqual([('-', '-')], tokenize('-'))

def test_mais(self):
    self.assertEqual([('+', '+')], tokenize('+'))
dataset = args.job_offer_file
output_file = args.output_file


def main():
    # Read random top 350K tweets
    top_tweets = pq.read_table(dataset).to_pandas()
    l = list(range(0, len(top_tweets)))
    # Select num_tweets rows at random
    random_tweet_set = random.choices(l, k=num_tweets)
    df = pd.DataFrame(top_tweets, index=random_tweet_set)
    csv_file = open(output_file, mode='w')
    fieldnames = ['Tweet_ID', 'Text', 'Token', 'ORG', 'LOC', 'JOB_TITLE', 'Sector']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for id in df.itertuples(index=False):
        index, text = id[0], id[2]
        tweet_tokens = tokenization.tokenize(text)
        for token in tweet_tokens:
            writer.writerow({'Tweet_ID': index, 'Text': text, 'Token': token,
                             'ORG': '', 'LOC': '', 'JOB_TITLE': '', 'Sector': ''})


if __name__ == "__main__":
    main()
class Node:
    def __init__(self, key, did_occure):
        self.node_key = key
        self.node_next = None
        self.node_val = 0
        self.occured = did_occure
        self.future_list = []
        self.count = 0


if __name__ == '__main__':
    token_list = []
    if len(sys.argv) > 1:
        token_list = tokenization.tokenize(sys.argv[1])
    else:
        filename = input("Enter Filename: ")
        token_list = tokenization.tokenize(filename)
    my_graph = Markov_Chain()
    for i, k in enumerate(token_list):
        if my_graph.get(k) is None:
            my_graph.set(k, True)
        else:
            my_graph.get(k).occured = True
            my_graph.get(k).node_val += 1
        if i < len(token_list) - 1:
            my_graph.update(k, token_list[i + 1])
    histo = sample.stochastic(my_graph)
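# --- Self-contained sketch (not the project's Markov_Chain class) ---
# The __main__ loop above records, for every token, which token follows it.
# The same bigram counting with plain dictionaries, on invented tokens:
from collections import defaultdict

example_tokens = "the cat sat on the mat".split()
transitions = defaultdict(lambda: defaultdict(int))
for current_tok, next_tok in zip(example_tokens, example_tokens[1:]):
    transitions[current_tok][next_tok] += 1
print(dict(transitions["the"]))  # {'cat': 1, 'mat': 1}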
def test_11_mais_11(self):
    self.assertEqual([
        ('N', '11'),
        ('+', '+'),
        ('N', '11')
    ], tokenize('11+11'))

def test_2(self):
    self.assertEqual([2], parse(tokenize('2')))

def test_1_espaco_1(self):
    self.assertEqual([
        ('N', '1'),
        ('N', '1')
    ], tokenize('1 1'))

def test_2_mais_2_vezes_1(self):
    self.assertEqual([2, 2, 1, '*', '+'], parse(tokenize('2+2*1')))

def test_1_mais_1_com_espacos(self):
    self.assertEqual([
        ('N', '1'),
        ('+', '+'),
        ('N', '1')
    ], tokenize('1 + 1'))

def test_2_mais_2(self):
    self.assertEqual([2, 2, '+'], parse(tokenize('2+2')))

def test_1(self):
    self.assertEqual([1], parse(tokenize('1')))