def createTrainingVectors(tokenized_texts_dict):
    """
    Given the filenames and their contents, this function creates the training vectors
    by building a unique list of all words present in the training set.
    """
    print("Creating vectors for training data")
    unique_words = []
    for filename, text in tokenized_texts_dict.items():
        # print("Reading {0} and adding to unique word list".format(filename))
        unique_words.extend(word_tokenize(text))
    unique_words = set(unique_words)

    # Creating the initial vector with counts 0 for all training sets
    zero_vector = OrderedDict(zip(unique_words, [0] * len(unique_words)))
    print("Creating the zero vector")

    # For each training file, create an OrderedDict containing its word counts
    # (together with zero counts), and store it in a dict keyed by filename
    vectors = {}
    for filename, token_list in tokenized_texts_dict.items():
        current_vector = zero_vector.copy()
        current_vector.update(Counter(word_tokenize(token_list)))
        vectors[filename] = current_vector
    return vectors, zero_vector
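# A hedged usage sketch of the vector builder above (the filenames and texts are
# invented for illustration; assumes OrderedDict, Counter and word_tokenize are
# imported as the function expects). Every vector shares the same vocabulary, so
# words absent from a file keep their zero count.
#
#     docs = {"a.txt": "the cat sat on the mat", "b.txt": "the dog barked"}
#     vectors, zero_vector = createTrainingVectors(docs)
#     print(vectors["b.txt"]["cat"])   # -> 0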
def cleaned_bag_of_words_dataset(data_matrix, stemming=False, stop_words=None, TFIDF=False, ngram_range=(1, 1), max_features=None, length=False, number_in_tweet=False, words_present=[]): if stemming: stemmer = SnowballStemmer("english") tweets = [" ".join([stemmer.stem(word) for word in word_tokenize(data_point[2].lower().decode("utf8"))]) for data_point in data_matrix] else: tweets = [data_point[2].lower() for data_point in data_matrix] if TFIDF: vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features) else: vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range, max_features=max_features) dataset = vectorizer.fit_transform(tweets).toarray() if length: lengths = np.array([[len(word_tokenize(data_point[2].decode("utf8")))] for data_point in data_matrix]) dataset = np.concatenate((dataset, lengths), axis=1) if number_in_tweet: numbers = [] for data_point in data_matrix: number_list = list_of_ints_from_string(data_point[2]) filtered_number_list = [number for number in number_list if abs(number) < 10] if len(filtered_number_list) == 0: numbers.append([0]) else: numbers.append([np.mean(filtered_number_list)]) dataset = np.concatenate((dataset, numbers), axis=1) for word in words_present: word_present = np.array([[int(word.lower() in word_tokenize(data_point[2].lower().decode("utf8")))] for data_point in data_matrix]) dataset = np.concatenate((dataset, word_present), axis=1) return dataset
def tokenize_sentences(filename):
    file_dir = docs_dir + str(filename)
    f = open(file_dir, 'r')
    root = ET.parse(f).getroot()
    tags = root.iter('str')  # read the relevant tags

    title_string = ''
    desc_string = ''
    for tag in tags:
        if tag.get('name') == 'Title':
            title_string = ''.join(x for x in tag.text.lower().strip()
                                   if x in string.printable)
        elif tag.get('name') == 'Abstract':
            desc_string = ''.join(
                x for x in tag.text.lower().strip().replace('relevant documents will describe', '')
                if x in string.printable)
    f.close()

    sentences = sent_tokenize(title_string)
    title_words = []
    for s in sentences:
        title_words = title_words + word_tokenize(s)

    sentences = sent_tokenize(desc_string)
    desc_words = []
    for s in sentences:
        desc_words = desc_words + word_tokenize(s)

    return (title_words, desc_words)
def search(dictionary_file, postings_file, query_file, output_file): try: # Remove previous output file os.remove(output_file) except OSError: pass inverted_index = InvertedIndex(dictionary_file, postings_file) meta_data = get_meta_data() tree = ET.parse(query_file) root = tree.getroot() title_tokens = [] description_tokens = [] raw_tokens = [] for child in root: if child.tag == 'title': title_tokens = build_tokens(child.text) raw_tokens.extend(word_tokenize(child.text)) elif child.tag == 'description': description_tokens = build_tokens(child.text) raw_tokens.extend(word_tokenize(child.text)) raw_tokens = helper.remove_stop_words_without_normalize(helper.filter_invalid_characters(raw_tokens)) additional_tokens = [] for token in list(set(raw_tokens)): additional_tokens.extend(helper.get_similar_words(token)) title_tokens = helper.remove_stop_words(helper.filter_invalid_characters(title_tokens)) description_tokens = helper.remove_stop_words(helper.filter_invalid_characters(description_tokens)) # tight results are results which favour high precision. We use this as a proxy for true positive tight_results = execute_query(title_tokens, description_tokens, [], inverted_index, meta_data) global top_UPC_classes global top_IPC_classes global top_family_members global top_cited_by # Get top UPC, IPC, family members and cited by from our true positive proxy results # This helps us determine which documents are more similar to the original top results # when we add in the additional similar words top_UPC_classes = get_top_classes(tight_results, meta_data['UPC_class'], 6) top_IPC_classes = get_top_classes(tight_results, meta_data['IPC_class'], 4) top_family_members = get_top_members(tight_results, meta_data['family_members'], 20) top_cited_by = get_top_members(tight_results, meta_data['cited_by'], 20) # query expansion # supplementary_results = expand_query(tight_results, meta_data['doc_top_terms'], inverted_index, meta_data) # synonyms, hypernyms additional_tokens = helper.normalize_tokens(list(set(additional_tokens))) results = execute_query(title_tokens, description_tokens, additional_tokens, inverted_index, meta_data) k = int(TOP_X_PERCENT_RESULTS * len(results)) # j = int(TOP_X_PERCENT_RESULTS * len(supplementary_results)) # results = list(set(results[:k] + supplementary_results[:j])) write_to_output(output_file, results[:k])
def max_similarity(context_sentence, ambiguous_word, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None, best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between the possible
    synsets of all words in the context sentence and the possible synsets of the
    ambiguous word (see http://goo.gl/XMq2BI):

        argmax_{synset(a)} \sum_{i}^{n} max_{synset(i)} sim(i, a)
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If the ambiguous word is not in WordNet, return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except TypeError:
            # Older NLTK versions expose pos as an attribute rather than a method
            if pos and pos != str(i.pos):
                continue
        result[i] = sum(max([sim(i, k, option) for k in wn.synsets(j)] + [0])
                        for j in context_sentence)
    if option in ["res", "resnik"]:  # lower score = more similar
        result = sorted([(v, k) for k, v in result.items()])
    else:  # higher score = more similar
        result = sorted([(v, k) for k, v in result.items()], reverse=True)
    # print(result)
    if best:
        return result[0][1]
    return result
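# A hedged usage sketch: with the module's lemmatize/sim helpers in scope and the
# WordNet corpus installed, a call might look like this (the sentence is invented):
#
#     sense = max_similarity("I went to the bank to deposit my money",
#                            "bank", option="path", pos="n")
#     # `sense` is the wn.Synset whose summed similarity to the context is highest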
def main():
    # Load up txt files
    speech_file = open('trump-speeches/speeches.txt').read()
    tweets = json.load(open('trump_tweets.json'))
    tweet_list = []
    for tweet in tweets:
        tweet_list.append(tweet['text'])
    tweet_list = ' '.join(tweet_list)

    # Tokenize
    logging.info('Formatting training text')
    speech_token = word_tokenize(speech_file)
    tweet_token = word_tokenize(tweet_list)

    # Train trigram models
    logging.info('Setting up models')
    speech_gram, speech_format = ngram(speech_token, 3)
    tweet_gram, tweet_format = ngram(tweet_token, 3)

    # Generate responses
    cont = True
    while cont:
        response = input("Hello sir, what can I Trumpinate for you?: ")
        num_words = input("And how many words should I write?: ")

        # Print phrases
        gen_phrase(speech_gram, int(num_words), starter_word=[response])
        print('')
        gen_phrase(tweet_gram, int(num_words), starter_word=[response])

        more = input("Would you like to generate more? (Yes, No): ")
        if more != 'Yes':
            cont = False
def get_cluster(s1, s2, dataset):
    """
    Return "cluster" (i.e. video or picture name) that the sentences came from
    """
    if dataset == 'FLICKR':
        data_reverse = flickr_reverse
        sent_1 = ' '.join(word_tokenize(s1))
        sent_2 = ' '.join(word_tokenize(s2))
    else:
        data_reverse = msr_reverse
        sent_1 = s1
        sent_2 = s2

    if sent_1 not in data_reverse:
        return None
    if sent_2 not in data_reverse:
        return None

    candidates_1 = set(data_reverse[sent_1])
    candidates_2 = set(data_reverse[sent_2])
    if len(candidates_1 & candidates_2) > 0:
        return list(candidates_1 & candidates_2)[0]
def getBigramBeginWithNotCount(sent):
    negative_keywords = ["bad", "sad", "don't", "could not", "crappy", "unfortunately",
                         "remove", "why", "poor", "bothersome", "terrible", "although",
                         "complaints", "outrageous", "isn't", "poorly", "drawback",
                         "annoying", "against", "irritating", "wouldn't", "won't",
                         "wasn't", "couldn't", "awful", "didn't", "hasn't", "difficult",
                         "hate", "incorrect", "junk", "trash", "removed", "complain",
                         "complained", "hated", "negative"]
    bigramPositiveCount = 0
    '''
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    for bigram in nltk.bigrams(word_tokenize(sent)):
        if bigram[0].lower() == "not" and bigram[1].lower() in negative_keywords:
            print(sent)
            print(bigram)
            print(unigram_tagger.tag(word_tokenize(sent)))
            bigramNotCount += 1
    '''
    tokens = word_tokenize(sent)
    for i, word in enumerate(tokens):
        if word.lower() == "not" and i + 1 < len(tokens):
            if tokens[i + 1] in negative_keywords:  # e.g. NOT bad
                bigramPositiveCount += 1
            elif i < len(tokens) - 2 and tokens[i + 2] in negative_keywords:  # e.g. NOT too bad
                bigramPositiveCount += 1
            else:  # e.g. NOT good
                bigramPositiveCount -= 1
    return bigramPositiveCount
def word_standardize(sentences): tokens = [] sentences_st = [] for sent in sentences: tokens.extend(word_tokenize(sent)) sentences_st.append(word_tokenize(sent)) words = tokens st = LancasterStemmer() words = [w.lower() for w in words] words = [w for w in words if not w in stopwords.words('english')] words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'] st_words = [st.stem(w) for w in words] sent_result = [] for sent in sentences_st: sent = [w.lower() for w in sent] sent = [w for w in sent if not w in stopwords.words('english')] sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'] sent_result.append(sent) return st_words, sent_result
def test(testAccents, testNoAccents, dictnoAccents): count = 0 correct = 0 notWord = [] result = [] incorrect = {} wordCount = 0 nonWordCount = 0 for i in range(len(testAccents)): sent = "" sentenceAccents = testAccents[i] sentenceNoAccents = testNoAccents[i] tokensAccents = word_tokenize(sentenceAccents) tokensNoAccents = word_tokenize(sentenceNoAccents) if len(tokensAccents) == len(tokensNoAccents): for j in range(len(tokensAccents)): tA = tokensAccents[j] tNA = tokensNoAccents[j] if tNA not in punctuation and not tNA.isdigit(): wordCount +=1 if tNA in dictnoAccents.keys(): newToken = max(dictnoAccents[tNA], key=dictnoAccents[tNA].get) #print(newToken) #print("YES") else: newToken = tNA if newToken == tA: correct +=1 else: incorrect[newToken] = tA # print(newToken) # print(tA) count +=1 #print("HI") if j != 0: newToken = " " + newToken else: nonWordCount +=1 notWord.append(tNA) newToken = tNA sent = sent + newToken result.append(sent) print("Le nombre de mot dans le corpus: " + str(wordCount) ) print("Le nombre de ponctuation et de nombres dans le corpus: " + str(nonWordCount)) print("Nombre au total de changements/non changements possibles " + str(count )) print("Nombre au total de decisions correctes " + str(correct)) print("Accuracy: " + str(correct/count) ) return([incorrect,correct/count, wordCount, nonWordCount])
def load_anssel_samples(qtext, atexts):
    samples = []
    qtext = word_tokenize(qtext)
    for atext in atexts:
        atext = word_tokenize(atext)
        samples.append({'qtext': ' '.join(qtext), 'label': 0, 'atext': ' '.join(atext)})
    return samples
def load_data(loc='./data/'): """ Load MSRP dataset """ trainloc = os.path.join(loc, 'msr_paraphrase_train.txt') testloc = os.path.join(loc, 'msr_paraphrase_test.txt') trainA, trainB, testA, testB = [],[],[],[] trainS, devS, testS = [],[],[] f = open(trainloc, 'rb') for line in f: text = line.strip().split('\t') trainA.append(' '.join(word_tokenize(text[3]))) trainB.append(' '.join(word_tokenize(text[4]))) trainS.append(text[0]) f.close() f = open(testloc, 'rb') for line in f: text = line.strip().split('\t') testA.append(' '.join(word_tokenize(text[3]))) testB.append(' '.join(word_tokenize(text[4]))) testS.append(text[0]) f.close() trainS = [int(s) for s in trainS[1:]] testS = [int(s) for s in testS[1:]] return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS]
def write_anotations_to_file(lst_annotation, file_name): with codecs.open(file_name, 'w', 'utf-8') as f: for annotation in lst_annotation: annotation_full_text = annotation.text car_name = preprocessor_text(annotation.name) annotation_start = annotation_full_text.find(car_name) annotation_end = annotation.start + len(car_name) full_text_before_annotation = preprocessor_text(annotation_full_text[:annotation_start].strip()) before_tokens = word_tokenize(full_text_before_annotation) for token in before_tokens: f.write( token + u' ' + u'O' + u'\n' ) annotation_tokens = word_tokenize(car_name) for idx, token in enumerate(annotation_tokens): if idx == 0: label = u'B' else: label = u'I' f.write( token + u' ' + label + u'\n' ) full_text_after_annotation = preprocessor_text(annotation_full_text[annotation_end:]).strip() after_tokens = word_tokenize(full_text_after_annotation) for token in after_tokens: f.write( token + u' ' + u'O' + '\n' ) f.write( u'\n' )
def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()

    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    # str.maketrans with a third argument deletes those characters on translate()
    if digit:
        table = str.maketrans('', '', string.punctuation + string.digits)
    else:
        table = str.maketrans('', '', string.punctuation)
    tokens = set(word_tokenize(unify_units(s).translate(table)))

    if stop:
        tokens = set(word for word in tokens if word not in stop_words)
    if stem:
        tokens = set(stemmer.stem(word) for word in tokens)
    return tokens
def load_ace_file(textfile, fmt): print ' - %s' % os.path.split(textfile)[1] annfile = textfile+'.tmx.rdc.xml' # Read the xml file, and get a list of entities entities = [] xml = ET.parse(open(annfile)).getroot() for entity in xml.findall('document/entity'): typ = entity.find('entity_type').text for mention in entity.findall('entity_mention'): if mention.get('TYPE') != 'NAME': continue # only NEs s = int(mention.find('head/charseq/start').text) e = int(mention.find('head/charseq/end').text)+1 entities.append( (s, e, typ) ) # Read the text file, and mark the entities. text = open(textfile).read() # Strip XML tags, since they don't count towards the indices text = re.sub('<(?!/?TEXT)[^>]+>', '', text) # Blank out anything before/after <TEXT> def subfunc(m): return ' '*(m.end()-m.start()-6) text = re.sub('[\s\S]*<TEXT>', subfunc, text) text = re.sub('</TEXT>[\s\S]*', '', text) # Simplify quotes text = re.sub("``", ' "', text) text = re.sub("''", '" ', text) entity_types = set(typ for (s,e,typ) in entities) # Binary distinction (NE or not NE) if fmt == 'binary': i = 0 toks = Tree('S', []) for (s,e,typ) in sorted(entities): if s < i: s = i # Overlapping! Deal with this better? if e <= s: continue toks.extend(word_tokenize(text[i:s])) toks.append(Tree('NE', text[s:e].split())) i = e toks.extend(word_tokenize(text[i:])) yield toks # Multiclass distinction (NE type) elif fmt == 'multiclass': i = 0 toks = Tree('S', []) for (s,e,typ) in sorted(entities): if s < i: s = i # Overlapping! Deal with this better? if e <= s: continue toks.extend(word_tokenize(text[i:s])) toks.append(Tree(typ, text[s:e].split())) i = e toks.extend(word_tokenize(text[i:])) yield toks else: raise ValueError('bad fmt value')
def best_bigrams(sents_tagged, stopwords, score_fn=BigramAssocMeasures.likelihood_ratio, n=300): sents_pos = [] sents_neg = [] # Separate positive and negative sentences. for tag, sent in sents_tagged: if tag == 1: sents_pos.append(sent) elif tag == -1: sents_neg.append(sent) # Extract words from positive and negative sentences. words_pos = [word.lower() for s in sents_pos for word in word_tokenize(s) if word not in string.punctuation] words_neg = [word.lower() for s in sents_neg for word in word_tokenize(s) if word not in string.punctuation] # Find the best bigrams for positive sentences based on informative collocations bigram_finder1 = BigramCollocationFinder.from_words(words_pos) bigrams_best_pos = bigram_finder1.nbest(score_fn, n) # Find the best bigrams for negative sentences based on informative collocations bigram_finder2 = BigramCollocationFinder.from_words(words_neg) bigrams_best_neg = bigram_finder2.nbest(score_fn, n) bigrams_all = list(set(bigrams_best_pos).union(set(bigrams_best_neg))) # Select only the bigrams that have either one of the word greater than length 3 bigrams_best = [bigram for bigram in bigrams_all if len(bigram[0]) > 3 and len(bigram[1]) > 3 and bigram[0] not in ex and bigram[1] not in ex ] return bigrams_best
def __next__(self): if self.sentence == False: # we will treat one document = one hansard statement res = cursor.fetchone() if res == None: raise StopIteration else: x = self.sentenceHandler(res) x=word_tokenize(x) # tokenize # optional stemmer #wnl = EnglishStemmer() #lemmed = [] #for word in x: # newword=wnl.stem(word) # lemmed.append(newword) y=doStop(x) # remove stopwords and procedural words x=doProcedural(y) return (x) else: # we will treat one document = one sentence if self.paragraphInProgress==False: # this is a new paragraph, so fetch it res = cursor.fetchone() if res == None: raise StopIteration else: # new paragraph fetched successfully self.paragraphInProgress==True x = self.sentenceHandler(res) self.workingParagraph = sent_tokenize(x) doc = self.workingParagraph.pop(0) doc=word_tokenize(doc) # tokenize y=doStop(doc) # remove stopwords and procedural words x=doProcedural(y) # before we end, check whether this was a one-sentence paragraph if len(self.workingParagraph)==0: self.paragraphInProgress==False return (x) elif self.paragraphInProgress==True: # we have already started a paragraph with list of sentences, so pop the first one and yield it as tokens # if length becomes 0 at the end, reset the paragraphInProgress flag doc = self.workingParagraph.pop(0) doc=word_tokenize(doc) # tokenize y=doStop(doc) # remove stopwords and procedural words x=doProcedural(y) # before we end, check whether this was a one-sentence paragraph if len(self.workingParagraph)==0: self.paragraphInProgress==False return (x)
def sum_basic(lines, word_limit, update_non_redundency=True):

    def weight(sents, distribution):
        def _weight_sent(sent):
            tokens = preprocess(word_tokenize(sent))
            return sum(distribution.get(x) for x in tokens) / len(tokens)
        return [_weight_sent(sent) for sent in sents]

    def probability_distribution(tokens):
        N = len(tokens)
        distinct_words = set(tokens)
        return {w: tokens.count(w) / N for w in distinct_words}

    sents = to_sents(lines)
    tokens = to_tokens(sents)
    tokens = preprocess(tokens)
    pd = probability_distribution(tokens)

    summary = ""
    while len(word_tokenize(summary)) < word_limit:
        weights = weight(sents, pd)
        highest_weight_sentence = max(zip(sents, weights), key=itemgetter(1))[0]
        summary += " " + highest_weight_sentence
        if update_non_redundency:
            # squash the probability of words already used in the summary
            for token in preprocess(word_tokenize(highest_weight_sentence)):
                pd[token] = pd[token] * pd[token]
        else:
            sents.remove(highest_weight_sentence)
    return summary
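# A hedged usage sketch of the SumBasic loop above: given the module's to_sents /
# to_tokens / preprocess helpers, a 100-word summary of a plain-text article could
# be produced along these lines (the filename is invented):
#
#     with open("article.txt") as f:
#         print(sum_basic(f.readlines(), word_limit=100))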
def sentence_matches(self, sentence_text):
    """Returns true iff the sentence contains this mention's upstream
    and downstream participants, and if one of the stemmed verbs in
    the sentence is the same as the stemmed action type."""
    has_upstream = False
    has_downstream = False
    has_verb = False

    # Get the first word of the action type and assume this is the verb
    # (Ex. get depends for depends on)
    actiontype_words = word_tokenize(self.mention.actiontype)
    actiontype_verb_stemmed = stem(actiontype_words[0])

    words = word_tokenize(sentence_text)

    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.upstream.lower()):
        has_upstream = True

    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.downstream.lower()):
        has_downstream = True

    for word in words:
        if actiontype_verb_stemmed == stem(word):
            has_verb = True

    return has_upstream and has_downstream and has_verb
def post(self):
    args = parser.parse_args()
    text = {'text': args['text']}
    print(text)
    print(sent_tokenize(text['text']))
    print(word_tokenize(text['text']))
    return text['text']
def get_doc_abstract_query_List(norm): ranked_top_10_doc_list = map(operator.itemgetter(0), ranked_scores_top_10) result_query = "" count = 0 synonym_words_list = [] for docID in ranked_top_10_doc_list: if dir_of_docs.endswith("/"): docID_file_dir = dir_of_docs + docID + ".xml" else: docID_file_dir = dir_of_docs + "/" + docID + ".xml" xml_doc = Document(docID, docID_file_dir) title = xml_doc.get_title() result_query += title + " " """ if count < 1: # Only get abstract from top document(s) result_query += xml_doc.get_abstract() + " " """ # Adds synonyms for the top ranked document's title to new query if count <= 10: title_words = word_tokenize(title) for w in title_words: synonym_words_list = norm.combine_list(synonym_words_list, norm.get_synonym_list(w)) count += 1 result_query_list = word_tokenize(result_query) result_query_list = norm.combine_list(result_query_list, synonym_words_list) normalized = norm.normalize_tokens(result_query_list) return normalized
def calculate_pmi_use_case2(self, schema): print("Calculating PMI for " + schema) corpus_count = 0 text = [] for item in self.__mongo_db.get(schema, {}): text += word_tokenize(item['text'], language='german') corpus_count += len(word_tokenize(item['text'], language='german')) print(corpus_count) counter = Counter(text) single_pattern_table = self.__postgre_db.get_data_from_table(schema, "bscale_single_pattern") # counting single pattern occurrences for item in single_pattern_table: word = item['single_pattern'] count = counter[word] self.__postgre_db.update(schema, "bscale_single_pattern", "count=" + str(count), "single_pattern=" + add_quotes(word)) # pmi calculation co_occ_table = self.__postgre_db.get_data_from_table(schema, "correlating_pattern") for item in co_occ_table: item_id = item['id'] co_occ_freq = float(item['count'] / corpus_count) word1_id = item['pattern_a'] word2_id = item['pattern_b'] word1_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word1_id), "count") print(word1_occ) word2_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word2_id), "count") print(word2_occ) pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count))) print(pmi) self.__postgre_db.update(schema, "correlating_pattern", "pmi=" + str(pmi), "id=" + str(item_id))
def clean_raw_txt(body, headline, punct_dct=None, stopwrds_set=None):
    """Clean the body and headline to remove punctuation, stopwords, etc.

    Args:
    ----
        body: str
        headline: str
        punct_dct (optional): dict
            Translation dict resulting from a `str.maketrans()` call
        stopwrds_set (optional): set

    Return:
    ------
        (body, headline): tuple
    """
    if punct_dct:
        body = body.translate(punct_dct)
        headline = headline.translate(punct_dct)

    body_wrds = word_tokenize(body)
    headline_wrds = word_tokenize(headline)

    stopwrds_set = set() if stopwrds_set is None else stopwrds_set
    body_wrds = [wrd.lower() for wrd in body_wrds if wrd.lower() not in stopwrds_set]
    headline_wrds = [wrd.lower() for wrd in headline_wrds if wrd.lower() not in stopwrds_set]

    return (body_wrds, headline_wrds)
def _doc2vec_doc_stream(paths, n, sentences=True): """ Generator to feed sentences to the dov2vec model. """ phrases = Bigram() i = 0 p = Progress() for path in paths: with open(path, 'r') as f: for line in f: i += 1 p.print_progress(i/n) # We do minimal pre-processing here so the model can learn # punctuation line = line.lower() if sentences: for sent in sent_tokenize(line): tokens = word_tokenize(sent) yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)]) else: tokens = word_tokenize(line) yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])
def __init__(self, txt_type: str, txt: str):
    self.txt_type = txt_type
    if txt_type == "paragraph":
        self.sentences = [word_tokenize(w) for w in sent_tokenize(txt)]
    else:
        self.title = word_tokenize(txt)
def load_sick2014(dsfile, mode='relatedness'): """ load a dataset in the sick2014 tsv .txt format; mode='relatedness': use the sts relatedness score as label mode='entailment': use -1 (contr.), 0 (neutral), 1 (ent.) as label """ s0 = [] s1 = [] labels = [] with open(dsfile) as f: first = True for line in f: if first: # skip first line with header first = False continue line = line.rstrip() pair_ID, sentence_A, sentence_B, relatedness_score, entailment_judgement = line.split('\t') if mode == 'relatedness': label = float(relatedness_score) elif mode == 'entailment': if entailment_judgement == 'CONTRADICTION': label = -1 elif entailment_judgement == 'NEUTRAL': label = 0 elif entailment_judgement == 'ENTAILMENT': label = +1 else: raise ValueError('invalid label on line: %s' % (line,)) else: raise ValueError('invalid mode: %s' % (mode,)) labels.append(label) s0.append(word_tokenize(sentence_A)) s1.append(word_tokenize(sentence_B)) return (s0, s1, np.array(labels))
def load_samples(question, prop_labels):
    samples = []
    q = word_tokenize(question)
    for label in prop_labels:
        text = word_tokenize(label.lower())
        samples.append({'qtext': ' '.join(q), 'label': 0, 'atext': ' '.join(text)})
    return samples
def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if w not in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))
def load_anssel(dsfile, subsample0=3): """ load a dataset in the anssel csv format; subsample0=N denotes that only every N-th 0-labelled sample should be loaded; so e.g. N=3 reduces 80k negatives to 28k negatives in the training set (vs. 4k positives); N=10k gets you just 8k negatives, etc. """ s0 = [] s1 = [] labels = [] i = 0 with open(dsfile) as f: c = csv.DictReader(f) for l in c: label = int(l['label']) if label == 0 and (i % subsample0) != 0: i += 1 continue labels.append(label) try: qtext = l['qtext'].decode('utf8') atext = l['atext'].decode('utf8') except AttributeError: # python3 has no .decode() qtext = l['qtext'] atext = l['atext'] s0.append(word_tokenize(qtext)) s1.append(word_tokenize(atext)) i += 1 return (s0, s1, np.array(labels))
def obtaindata(pos_file,neg_file): ##read the input files short_pos = open(pos_file, "r").read() short_neg = open(neg_file, "r").read() documents = [] # documents is gonna be a list of tuples that have a line of review and a class (pos or neg) for r in short_pos.split('\n'): documents.append((r, "pos")) for r in short_neg.split('\n'): documents.append((r, "neg")) all_words = [] # gonna contain all the words in both corpuses combined (nonunique) short_pos_words = word_tokenize(short_pos) short_neg_words = word_tokenize(short_neg) for w in short_pos_words: all_words.append(w.lower()) for w in short_neg_words: all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) word_features = list(all_words.keys())[:5000]#gets the top 5000 most common words to use as features featuresets = [(find_features(rev,word_features), category) for (rev, category) in documents] random.shuffle(featuresets) return featuresets
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
for w in example_words:
    print(ps.stem(w))

new_text = "John jumped and was jumping to jump but jumps often jumply"
print("un stemmed = ", new_text)

words = word_tokenize(new_text)
stop_words = set(stopwords.words("english"))

words_no_stop_words = []
for w in words:
    if w not in stop_words:
        words_no_stop_words.append(w)
print("New Text with no stop words = ", words_no_stop_words)

for words in words_no_stop_words:
    print(" Stemmed = ", ps.stem(words))
def ret(text):
    text = word_tokenize(text.lower())
    return [w for w in text if w not in self.stopset + self.punct]
from nltk.stem.lancaster import LancasterStemmer import os import string import pickle from gensim import corpora, models, similarities import logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) raw_material = [ line.strip() for line in open('material.txt', 'r').read().split( '-=this is the spread line=-') ] texts_tokenized = [[ word.lower() for word in word_tokenize(document.decode('utf-8')) ] for document in raw_material] english_stopwords = stopwords.words('english') texts_filtered_stopwords = [[ word for word in document if not word in english_stopwords ] for document in texts_tokenized] texts_filtered = [[ word for word in document if not word in string.punctuation ] for document in texts_filtered_stopwords] st = LancasterStemmer() texts_stemmed = [[st.stem(word) for word in docment] for docment in texts_filtered]
for id_summary in range(130, 1615, 1):
    create_users_table = """select description from summary where id = """ + str(id_summary) + """;"""
    res = execute_read_query(connect, create_users_table)
    for i in res:
        text1 = i[0]
        corpus = []
        filtered = []
        normal = []
        normal1 = []
        # remove punctuation marks
        text1 = "".join([ch for ch in text1 if ch not in string.punctuation])
        # split the sentence into words
        corpus = word_tokenize(text1, language="russian")
        # remove stop words (prepositions, for example); the word list still needs manual refinement
        corpus = [w for w in corpus if w not in stop_words]
        # reduce each word to its normal (dictionary) form
        corpus = [morph.parse(token)[0].normal_form for token in corpus]
        experience = 0
        op = corpus.count("опыт")
        if (op > 0):
return True def on_error(self, status): print(status) return True twitter_stream = Stream(auth, MyListener()) twitter_stream.filter(track=['#dengue', 'dengue'], languages =['pt']) import pandas as pd from nltk.tokenize import word_tokenize df = pd.read_json("dados/spotify.json", orient = 'records', lines = True) #Otima opcao de leitura word_tokenize(" ".join(df['text']), language = 'portuguese') import json with open('python.json', 'r') as f: line = f.readline() # read only the first tweet/line tweet = json.loads(line) # load it as Python dict print(json.dumps(tweet, indent=4)) # pretty-print tweet['text'] from textblob import TextBlob as tb
def normalize_document(pathname, filename): document_words = dict() path = os.path.join(pathname, filename) with open(path, 'r') as document: for line in document: sentence_to_normalize = line.strip() if len(sentence_to_normalize) == 0: continue print_coloured_bold( '\nSentence to stem: ' + sentence_to_normalize + '\n', "red") #removing m-dash sentence_to_normalize = sentence_to_normalize.replace("–", " ").lower() sentence_to_normalize = re.sub("-{2,}", "", sentence_to_normalize) #removing contract forms if ("'t" in sentence_to_normalize): sentence_to_normalize = sentence_to_normalize.replace("'t", "") #tokenization word_tokens = word_tokenize(sentence_to_normalize) #punctuation removal word_tokens_filtered = [ w for w in word_tokens if not w in punctuation and not w == "'s" ] #skip if punctuation within words (except -./) or split if / within word word_tokens_noslash = list() for w in word_tokens_filtered: if not any(char in punctuation.replace("-", "").replace( ".", "").replace("/", "") for char in w): if "/" in w: words = w.split("/") for split in words: if not split == "": word_tokens_noslash.append(split) else: word_tokens_noslash.append(w) #leave acronyms and split others in case of . word_tokens_dot = list() regex = re.compile('(?:[a-z]\.){2,}') for w in word_tokens_noslash: if (w + "." in sentence_to_normalize and regex.match(w + ".")): word_tokens_dot.append(w) elif ("." in w): words = w.split(".") for split in words: if not split == "": word_tokens_dot.append(split) else: word_tokens_dot.append(w) #stopwords removal (done before stemming, less words to stem) stop_words = set(stopwords.words('english')) no_stopwords_sentence = [ w for w in word_tokens_dot if not w in stop_words ] #digits removal sentence_words_nodigits = [ w for w in no_stopwords_sentence if not w.isdigit() ] #roman numerals removal regex = re.compile('^(?=[MDCLXVI])M*D?C{0,4}L?X{0,4}V?I{0,4}$') no_roman_numerals_sentence = [ w for w in sentence_words_nodigits if not regex.match(w) ] #one letter words removal sentence_words_nosingleletters = [ w for w in no_roman_numerals_sentence if not len(w) < 2 ] print_coloured_bold("Stop words result", "cyan") print(sentence_words_nosingleletters) print('\n') #stemming stemmer = TreeTagger( path_to_treetagger='/home/biar/Desktop/ProgettoWIR/treetagger') for word in sentence_words_nosingleletters: stem = stemmer.tag(word) if not (stem[0][1] == "CRD"): if not stem[0][2] == '<unknown>': if '|' in stem[0][2]: first_word = ((stem[0][2]).split('|'))[0] stem[0][2] = first_word if (len(first_word) > 1): w = correct_stemming(stem).lower() if not w in document_words: document_words[w] = 1 else: document_words[w] += 1 else: if (len((stem[0][2]).lower()) > 1): w = correct_stemming(stem).lower() if not w in document_words: document_words[w] = 1 else: document_words[w] += 1 else: w = (stem[0][0]).lower() if not w in document_words: document_words[w] = 1 else: document_words[w] += 1 return document_words
if token in stopwords.words('english'): clean_tokens.remove(token) freq = nltk.FreqDist(clean_tokens) for key,val in freq.items(): print (str(key) + ':' + str(val)) freq.plot(20,cumulative=False) from nltk.tokenize import sent_tokenize mytext = "Hello Adam, how are you? I hope everything is going well. Today is a good day, see you dude." print(sent_tokenize(mytext)) from nltk.tokenize import sent_tokenize mytext = "Hello Mr. Adam, how are you? I hope everything is going well. Today is a good day, see you dude." print(sent_tokenize(mytext)) from nltk.tokenize import word_tokenize mytext = "Hello Mr. Adam, how are you? I hope everything is going well. Today is a good day, see you dude." print(word_tokenize(mytext)) from nltk.tokenize import sent_tokenize mytext = "Bonjour M. Adam, comment allez-vous? J'espère que tout va bien. Aujourd'hui est un bon jour." print(sent_tokenize(mytext,"french")) from nltk.corpus import wordnet syn = wordnet.synsets("pain") print(syn[0].definition()) print(syn[0].examples()) from nltk.corpus import wordnet syn = wordnet.synsets("NLP") print(syn[0].definition()) syn = wordnet.synsets("Python") print(syn[0].definition())
stemmer = PorterStemmer() lemmatizer = WordNetLemmatizer() uniquewords = {} labels = [0] * len(titles) for j in range(len(titles)): labels[j] = hash_labels[labelsTrain[j]] temp = titles[j].lower() temp = re.sub(r'\d+', '', temp) tempstr = "" for char in temp: if char not in string.punctuation: tempstr += char temp = tempstr temp = temp.strip() temp = temp.replace('\n', ' ') t = word_tokenize(temp) temp = [k for k in t if not k in stop_words] temp2 = [stemmer.stem(word=word) for word in temp] temp3 = [lemmatizer.lemmatize(word=word) for word in temp2] titles[j] = ' '.join(temp3) for word in temp3: if word in uniquewords: uniquewords[word] += 1 else: uniquewords[word] = 1 tfidf_vectorizer = TfidfVectorizer(use_idf=True) unique_word_count_vectorizer = tfidf_vectorizer.fit_transform(titles) X_train, X_test, Y_train, Y_test = train_test_split( unique_word_count_vectorizer, labels, test_size=0.2, random_state=109) gnb = MultinomialNB() gnb.fit(X_train.toarray(), Y_train)
from nltk.corpus import stopwords from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory import pickle filename = 'TFIDF.pickle' pickle.dump(TFIDF, open(filename, 'wb')) kalimat = "PERANCANGAN SISTEM PAKAR UNTUK DIAGNOSA PENYAKIT ANAK" lower_case = kalimat.lower() print (lower_case) punctuation = lower_case.translate(str.maketrans('','',string.punctuation)).strip() print(punctuation) tokenize = word_tokenize(punctuation) print(tokenize) factory = StopWordRemoverFactory() stopword = factory.create_stop_word_remover() hasil_stopword = [] for i in tokenize: word = stopword.remove(i) if word !='': hasil_stopword.append(word) print(hasil_stopword) factory = StemmerFactory() stemmer = factory.create_stemmer()
def clean_sentence_stopwords(sentence):
    tokens = word_tokenize(sentence)
    sentence_clean = [w for w in tokens if w not in stopwords]
    sentence_clean = ' '.join(sentence_clean)
    return sentence_clean
print('Document file: ' + in_file + ' Saved in ' + out_path) #dispalys to the user the location of the saved file #Text Preprocessing print('File :' + in_file + ' submitted for preprocessing') #change end of sentence character to # #Enables reconstruction of sentences after stop words removal text_file = text_file.replace('.\n\n', '#') text_file = re.sub('[^a-zA-Z0-9\n#]', ' ', text_file) #remove special and unwanted characters text_file = text_file.lower( ) #convert all text to lower case; facilitates stop word removal #remove stop words and converts words to their stem/root stop_words = set(stopwords.words('english')) # define stopwords object word_tokens = word_tokenize( text_file) #split text to words, word_tokens is a list #removes all stopwords from word tokens, filtered_sentence is a list #Alternative code for removal of stopwords using "a list comprehension" #filtered_sentence = [w for w in word_tokens if not w in stop_words] #define a list to hold test whose stop words have been removed filtered_sentence = [] for w in word_tokens: if w not in stop_words: #stemword is user defined function #Stemword takes words back to their stem/root filtered_sentence.append(stemword(w)) text_file = ' '.join(filtered_sentence) # convert from list to string
def autocorrect(sent):
    t_text = word_tokenize(sent)
    t_text = [spell(word) for word in t_text]
    input_text = ' '.join(t_text)
    # print(input_text)
    return input_text
def show_entry_fields(): #------------------------------------------------------------ #add to dictionary def addtodict(list,k): for i in list: if i in dict: if k in dict[i][-1]: #print(k) dict[i][-1][1]=dict[i][-1][1]+1 else: dict[i] = dict[i]+[[k,1]] else: dict[i]=[[k,1]] # ------------------------------------------------ #tokenize query def tokenize_query(s): global extras_list query1 = word_tokenize(s) query = [lemmatizer.lemmatize(w.lower()) for w in query1 if not w in extras_list] #print(query) return query #--------------------------------------------------------- def find_intersection(lower_list,query): intersection=[] for i in lower_list: #print(i) if i in query: if i in intersection: continue else: intersection.append(i) else: continue #print(intersection) return intersection #------------------------------------------------------------- def document_tfidf(document,intersection,k): weight=[] for i in intersection: x=len(dict[i]) for j in range(0,x): if(dict[i][j][0]==k): weight.append(1+math.log10(dict[i][j][1])) for i in document: if i in intersection: continue else: x=len(dict[i]) for j in range(0,x): if(dict[i][j][0]==k): weight.append(1+math.log10(dict[i][j][1])) #weighted values before normalization #print(weight) n_value = normalise(weight) n=0 for k in weight: weight[n]=k/n_value n=n+1 return weight #-------------------------------------------------------------- def minimize_doc(document): listed = [] for i in document: if i in listed: continue else: listed.append(i) return listed #------------------------------------------------------------ def normalise(normal): sum = 0 for k in normal: sum = sum + (k*k) value = math.sqrt(sum) return float(value) #-------------------------------------------------- # for a query def add_query(query): for i in query: if i in query_dict: query_dict[i]=query_dict[i]+1 else: query_dict[i]=1 #--------------------------------------------------------------- #--------------------------------------------------------------- def query_tf(query_listed): for i in query_listed: q_tf[i] = 1+math.log10(query_dict[i]) #--------------------------------------------------------------- def query_idf(query_listed): for i in query_listed: if i not in dict: q_idf[i] = 0 else: q_idf[i] = math.log10((doc_count/len(dict[i]))) #--------------------------------------------------------------- #heap sort def heapify(arr, n, i): largest = i l = 2 * i + 1 r = 2 * i + 2 if l < n and arr[i] < arr[l]: largest = l if r < n and arr[largest] < arr[r]: largest = r if largest != i: arr[i],arr[largest] = arr[largest],arr[i] heapify(arr, n, largest) def heapSort(arr): n = len(arr) for i in range(n, -1, -1): heapify(arr, n, i) for i in range(n-1,-1, -1): arr[i], arr[0] = arr[0], arr[i] heapify(arr, i, 0) #--------------------------------------------------------------- f=[] #print(query_listed) #print(query_dict) #print("tf of query") #----------------------------------------------------- for (dirpath,dirnames,filenames) in walk('G:/Users/avina/Desktop/lol'): f.extend(filenames) doc_count = len(f) for k in f: print(k) fo = open(k,"r+",encoding="utf8") data = fo.read() list = word_tokenize(data) lower_list = [lemmatizer.lemmatize(w.lower()) for w in list if not w in extras_list] addtodict(lower_list,k) fo.close() #------------------------------------------------------------------------------- s = e1.get() # INPUT HERE<-------------------------------S--------------------------------------> query = tokenize_query(s) add_query(query) query_listed = minimize_doc(query) 
query_tf(query_listed) #------------------------------------------------------------------------------- query_idf(query_listed) #idf of query #print(q_idf) tf_idf = {} for i in q_tf: tf_idf[i] = (q_tf[i]*q_idf[i]) #tf idf value before normalization #print(tf_idf) sum = 0 for i in tf_idf: sum = sum+tf_idf[i]*tf_idf[i] #normalized value value = math.sqrt(sum) #print(value) for i in tf_idf: if value==0: print("No documents found") else: tf_idf[i]=tf_idf[i]/value #---------------------------------------------- #normalized tf idf value #print(tf_idf) # tf_idf dictionary contains the tf_idf values of query #------------------------------------------------------------ #------------------------------------------------------------- for i in query_listed: if i in dict: local_len = len(dict[i]) for j in range(0,local_len): file_name = dict[i][j][0] if file_name in document: continue else: document.append(dict[i][j][0]) else: continue #print(document) #------------------------------------------------------------- for k in document: fo = open(k,"r+",encoding="utf8") # print(k) data = fo.read() list = word_tokenize(data) lower_list = [lemmatizer.lemmatize(w.lower()) for w in list if not w in extras_list] fo.close() listed = minimize_doc(lower_list) intersection = find_intersection(lower_list,query) # print("intersection values") # print(intersection) weight = document_tfidf(listed,intersection,k) # print("weighted values of document after normalization") # print(weight) total = [] ins_len = len(intersection) l=0 cosine = 0 for i in intersection: total.append(weight[l]*tf_idf[i]) cosine = cosine+total[l] l = l+1 # tf idf of common values between document and query # print(total) # total cosine value # print(cosine) cosine_list.append(cosine) cosine_dict[cosine] = k #print(cosine_dict) n = len(cosine_list) heapSort(cosine_list) #print(cosine_list) count=0 # print("First Name: %s\nLast Name: %s" % (e1.get(), e2.get())) ll = [] for i in range(n-1,-1,-1): if count>10: break else: f1=cosine_list[i] ll.append(cosine_dict[f1]) count = count+1 b1=Label(root, text=ll[0],bg='lightblue',font='10',height=2,width=15).grid(row=5) b2=Label(root, text=ll[1],bg='lightblue',font='10',height=2,width=15).grid(row=6) b3=Label(root, text=ll[2],bg='lightblue',font='10',height=2,width=15).grid(row=7) b4=Label(root, text=ll[3],bg='lightblue',font='10',height=2,width=15).grid(row=8) b5=Label(root, text=ll[4],bg='lightblue',font='10',height=2,width=15).grid(row=9) b6=Label(root, text=ll[5],bg='lightblue',font='10',height=2,width=15).grid(row=10) b7=Label(root, text=ll[6],bg='lightblue',font='10',height=2,width=15).grid(row=11) b8=Label(root, text=ll[7],bg='lightblue',font='10',height=2,width=15).grid(row=12) b9=Label(root, text=ll[8],bg='lightblue',font='10',height=2,width=15).grid(row=13) b10=Label(root, text=ll[9],bg='lightblue',font='10',height=2,width=15).grid(row=14)
def change_token(texts):
    tokens = word_tokenize(texts)
    return tokens
import nltk
import io
from nltk.tokenize import sent_tokenize, word_tokenize

with io.open('filename.txt', 'r', encoding="UTF8") as myfile:
    data = myfile.read().replace('\n', '')

text = word_tokenize(data)
finished = nltk.pos_tag(text)
print(finished)
def my_tokenizer(doc):
    text = word_tokenize(doc)
    tokens_without_sw = [word for word in text if word not in all_stopwords]
    return tokens_without_sw
def on_data(self, data): try: tweet1=data.split(',"text":"')[1] tweet2=tweet1.split(',"source":"')[0] tweet3=tweet2.split('https:')[0] tweet4=tweet3.split(':')[1] tweet5=tweet4.replace('RT','') tweet6=tweet5.replace(':','') tweett=tweet6.replace('@','') xx=detect(tweett) if(xx=='en'): saveFile = open('CristianoGame.txt','a') saveFile.write(tweett) saveFile.write('\n') saveFile.close() words=word_tokenize(tweett) #print(words) #print('\n') fili=[] for w in words: if w not in stop_words: fili.append(w) fili2=[] #print(fili) # print("TABDIL") for w in fili: fili2.append(lemmatizer.lemmatize(w)) #print(fili2) #further checking end=time.clock() zaman=end-start str1=" ".join(str(e) for e in fili2) #print(str1) analysis=TextBlob(str1) analysis2=s.sentiment(str1) adad=analysis.sentiment.polarity print(adad) #print(adad) #xar[0]=x #print(zaman) x=adad y=zaman liste.append(adad) niste.append(zaman) print(len(liste)) #print("HI") #print(ii) #print("HI2") #alpha=open('file2.txt','a') #print >>alpha, adad #print("HI") #saveFile.write(tweett) #xar.append(x) #print(zaman) #yar.append(y) #ax1.clear() #ax1.plot(xar,yar) #print("HI") if(len(liste)==5000): print(liste[0]) print(" 150 ta !!! " ) print(np.mean(liste)) print("AVERAGE") print(sum(liste)) print("SUM") print(zaman) plt.plot(niste, liste,'-o') plt.title('Cristiano-1500') plt.xlabel('Time(sec)') plt.ylabel('SentimentValue') plt.show() if(len(liste)==30000): print(liste[0]) print(" 300 ta !!! " ) print(np.mean(liste)) print("AVERAGE") print(sum(liste)) print("SUM") print(zaman) plt.plot(niste, liste,'-o') plt.title('Messi-300') plt.xlabel('Time(sec)') plt.ylabel('SentimentValue') plt.show() if(len(liste)==3000): print(liste[0]) print(" 500 ta !!! " ) print(np.mean(liste)) print("AVERAGE") print(sum(liste)) print("SUM") print(zaman) plt.plot(niste, liste,'-o') plt.title('Cristiano-500') plt.xlabel('Time(sec)') plt.ylabel('SentimentValue') plt.show() if(len(liste)==11000): print(liste[0]) print(" 1000 ta !!! " ) print(np.mean(liste)) print("AVERAGE") print(sum(liste)) print("SUM") print(zaman) plt.plot(niste, liste,'-o') plt.title('Cristiano-1000') plt.xlabel('Time(sec)') plt.ylabel('SentimentValue') plt.show() if(len(liste)==20000): print(liste[0]) print(" 2000 ta !!! " ) print(np.mean(liste)) print("AVERAGE") print(sum(liste)) print("SUM") print(zaman) plt.plot(niste, liste,'-o') plt.title('Messi-2000') plt.xlabel('Time(sec)') plt.ylabel('SentimentValue') plt.show() if(len(liste)==30000): print(liste[0]) print(" 3000 ta !!! " ) print(np.mean(liste)) print("AVERAGE") print(sum(liste)) print("SUM") print(zaman) plt.plot(niste, liste,'-o') plt.title('Messi-3000') plt.xlabel('Time(sec)') plt.ylabel('SentimentValue') plt.show() #alaf = open('twik.txt','a') ''' for p in liste: alaf.write("%f\n" % p) #alaf.write('\n') ''' #alaf.close() #for itm in liste: # print>> #print(len(liste)) return True except: return True
file=open("/home/owner/PhD/dr.norbert/dataset/shorttext/agnews/semisupervised/agnewsraw_ensembele_train","r") lines = file.readlines() file.close() train_data = [] train_labels = [] train_trueLabels = [] train_textdata = [] for line in lines: line=line.lower().strip() arr = re.split("\t", line) train_data.append(arr[2]) word_tokens = word_tokenize(arr[2]) train_textdata.append(word_tokens) train_labels.append(arr[0]) train_trueLabels.append(arr[1]) list_toktextdatas.append(train_textdata) #file=open("D:/PhD/dr.norbert/dataset/shorttext/stackoverflow/semisupervised/stackoverflowraw_ensembele_test","r") file=open("/home/owner/PhD/dr.norbert/dataset/shorttext/agnews/semisupervised/agnewsraw_ensembele_test","r") #file=open("D:/PhD/dr.norbert/dataset/shorttext/data-web-snippets/semisupervised/data-web-snippetsraw_ensembele_test","r") #file=open("D:/PhD/dr.norbert/dataset/shorttext/biomedical/semisupervised/biomedicalraw_ensembele_test","r") lines = file.readlines() file.close()
X_train = [] Y_train = [] count = 0 for l in lines: # if count > 100: # break # count += 1 x, y = l.split(' ') Y_train.append(y) temp = open(path2 + x, 'r') temp = temp.read() # tokens=wordpunct_tokenize(str(temp)) # tokens = [w for w in tokens if not w in stop_words] # doc = [word for word in tokens if word in model.wv.vocab] # doc_mean = np.mean(model.wv[doc], axis=0) temp = word_tokenize(temp.lower()) v = model.infer_vector(temp) X_train.append(v) # In[5]: print("train file input and preprocess") with open(path3 + 'temp.txt') as f: lines = f.readlines() X_test = [] Y_test = [] count = 0 for l in lines: # if count > 100:
def main(): parser = argparse.ArgumentParser( description='Synthetic divergent data creation') parser.add_argument('--debug', help='debug mode', action='store_true') parser.add_argument('--data', help='input positive examples') parser.add_argument('--output', help='output directory of synthetic training data', default='synthetic') parser.add_argument( '--mode', help= 'how data examples are generated (p: parallel, u:uneven, i:insert, r:replace d:delete', default='i') parser.add_argument('--pretrained_bert', help='pretrained bert', default='bert-base-cased') parser.add_argument( '--bert_local_cache', help='path to local directory where pretrained bert is saved') o = parser.parse_args() d = synthetic_divergences() # Create directory for bert local cache if not os.path.exists(o.bert_local_cache): os.makedirs(o.bert_local_cache) pos_to_wrd = defaultdict(list) indices = [] with io.open(o.data, 'r', encoding='utf-8', newline='\n', errors='ignore') as f: i = 0 n_total = 0 for line in f: n_total += 1 if n_total % 100000 == 0: if n_total % 1000000 == 0: sys.stderr.write(str(n_total)) else: sys.stderr.write(".") indices.append(i) tok = line.strip('\n').split("\t") src = tok.pop(0).strip().split(' ') tgt = tok.pop(0).strip().split(' ') ali = tok.pop(0).strip().split(' ') src = word_tokenize(' '.join(src)) tagged_sent = nltk.pos_tag(src) words, tags = zip(*tagged_sent) pos = list(tags) d.add(src, tgt, pos, ali) pos_phrases_ngrams(src, pos, pos_to_wrd) i += 1 # Configure write mode and output files write_mode = 'w' output_path = os.path.join(o.output, 'from_{0}'.format(str(o.data.split('/')[-1]))) # Create output directories try: os.makedirs(output_path) except FileExistsError: sys.stderr.write('Warning: Output file already exists\n') if 'g' in o.mode: lm_model = BertForMaskedLM.from_pretrained( o.pretrained_bert, cache_dir=o.bert_local_cache) lm_tokenizer = BertTokenizer.from_pretrained( o.pretrained_bert, cache_dir=o.bert_local_cache) output_g = open(os.path.join(output_path, 'generalization'), write_mode) output_g_span = open(os.path.join(output_path, 'generalization.span'), write_mode) if 'p' in o.mode: lm_model = BertForMaskedLM.from_pretrained( o.pretrained_bert, cache_dir=o.bert_local_cache) lm_tokenizer = BertTokenizer.from_pretrained( o.pretrained_bert, cache_dir=o.bert_local_cache) output_p = open(os.path.join(output_path, 'particularization'), write_mode) output_p_span = open( os.path.join(output_path, 'particularization.span'), write_mode) if 'i' in o.mode: output_i = open(os.path.join(output_path, 'insert'), write_mode) output_i_span = open(os.path.join(output_path, 'insert.span'), write_mode) if 'u' in o.mode: output_u = open(os.path.join(output_path, 'uneven'), write_mode) output_u_span = open(os.path.join(output_path, 'uneven.span'), write_mode) if 'd' in o.mode: output_d = open(os.path.join(output_path, 'delete'), write_mode) output_d_span = open(os.path.join(output_path, 'delete.span'), write_mode) if 'r' in o.mode: output_r = open(os.path.join(output_path, 'replace'), write_mode) output_r_span = open(os.path.join(output_path, 'replace.span'), write_mode) for i in indices: # Insert sentence if 'i' in o.mode: synthetic_pair = d.insert_pair(i, o) if synthetic_pair: output_i.write('{0}\t{1}\n'.format(' '.join( synthetic_pair[0]), ' '.join(synthetic_pair[1]))) output_i_span.write('{0}\t{1}\n'.format( ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3]))) else: output_i.write(none_) output_i_span.write(none_) # Random pairing of sentences if 'u' in o.mode: synthetic_pair = 
d.uneven_pair(i, o) if synthetic_pair: output_u.write('{0}\t{1}\n'.format(' '.join( synthetic_pair[0]), ' '.join(synthetic_pair[1]))) output_u_span.write('{0}\t{1}\n'.format( ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3]))) else: output_u.write(none_) output_u_span.write(none_) # Create lexical substitution (generalization) instance if 'g' in o.mode: synthetic_pair = d.generalization_pair(i, o, lm_model, lm_tokenizer) if synthetic_pair: output_g.write('{0}\t{1}\n'.format(' '.join( synthetic_pair[0]), ' '.join(synthetic_pair[1]))) output_g_span.write('{0}\t{1}\n'.format( ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3]))) else: output_g.write(none_) output_g_span.write(none_) # Create lexical substitution (particularization) instance if 'p' in o.mode: synthetic_pair = d.particularization_pair(i, o, lm_model, lm_tokenizer) if synthetic_pair: output_p.write('{0}\t{1}\n'.format(' '.join( synthetic_pair[0]), ' '.join(synthetic_pair[1]))) output_p_span.write('{0}\t{1}\n'.format( ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3]))) else: output_p.write(none_) output_p_span.write(none_) # Create subtree deletion instance if 'd' in o.mode: synthetic_pair = d.delete_pair(i, o) if synthetic_pair: output_d.write('{0}\t{1}\n'.format(' '.join( synthetic_pair[0]), ' '.join(synthetic_pair[1]))) output_d_span.write('{0}\t{1}\n'.format( ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3]))) else: output_d.write(none_) output_d_span.write(none_) # Create phrase replacement instance if 'r' in o.mode: synthetic_pair = d.replace_pair(i, o, pos_to_wrd) if synthetic_pair: output_r.write('{0}\t{1}\n'.format(' '.join( synthetic_pair[0]), ' '.join(synthetic_pair[1]))) output_r_span.write('{0}\t{1}\n'.format( ' '.join(synthetic_pair[2]), ' '.join(synthetic_pair[3]))) else: output_r.write(none_) output_r_span.write(none_)
sumat_v1 = sumat_v1**0.5 if sumat_v1 == 0: sumat_v1 = 1 sumat_v2 = sumat_v2**0.5 if sumat_v2 == 0: sumat_v2 = 1 measure = sumat_pto / (sumat_v1 * sumat_v2) return measure directorio = os.listdir('Datosrank/') query = "31483txt start project gutenberg ebook gaslight sonata produced suzanne shell" stop_words = set(stopwords.words('english')) tok_query = word_tokenize(query) final_query = [word for word in tok_query if not word in stop_words] with open('pal.json') as json_file: pal = json.load(json_file) print("Acabe la larga espera pal") with open('queryw.json') as json_file: queryw = json.load(json_file) print("Acabe la larga espera query") listidx = [] vectorw_query = [] for word in final_query: idx = pal.index(word)
) # membuang kata yang hanya satu huruf dari awal tmp = re.sub(r'\s+', ' ', str(fitur_ekstraksi2[cuitan]) ) # mengganti spasi ganda dengan spasi tunggal fitur_ekstraksi3.append(tmp) fitur_ekstraksi5 = [] for cuitan in range(0, len(fitur_ekstraksi3)): tmp = word_tokenize(str(fitur_ekstraksi3[cuitan])) fitur_ekstraksi5.append(tmp) return fitur_ekstraksi5 stopsunda1 = open('stopwordv1.txt', 'r') stopsunda2 = stopsunda1.read() stopsunda = word_tokenize(stopsunda2) def swr(a, b): filtered_sentence = [] for w in a: if w not in b: filtered_sentence.append(w) return filtered_sentence callbackvalue = preprocessing(fitur) def stopw(datanext): fitur_ekstraksistop = []
def get_syllable_count(text):
    return sum(map(lambda w: allnumsyllables(w), word_tokenize(text)))
if not os.path.isfile('featureVectorForSentence.csv'): open('featureVectorForSentence.csv', 'w') with open('featureVectorForSentence.csv', 'w') as featuresFile: featuresFile.write('') s = LancasterStemmer() unwantedWordes = [ 'the', 'a', 'is', 'was', 'are', 'were', 'to', 'at', 'i', 'my', 'on', 'me', 'of', '.', 'in', 'that', 'he', 'she', 'it', 'by' ] for i in range(0, a - 1): lexicon_dictionary[i][0] = s.stem(lexicon_dictionary[i][0]) for x in sentences: featureVector = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] words = word_tokenize(x) for y in words: y = s.stem(y) y = y.lower() if y in unwantedWordes != -1: continue for i in range(0, a - 1): if y == lexicon_dictionary[i][0]: for j in range(0, 10): featureVector[j] = featureVector[j] + int( lexicon_dictionary[i][j + 1]) break # write this feature vector to featureVectors File for k in range(0, 9): with open('featureVectorForSentence.csv', 'a') as featuresFile: featuresFile.write(str(featureVector[k]) + ',')
def find_features(document, word_features):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
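# A small sanity check for find_features (assumes NLTK's punkt data is installed and
# word_tokenize is imported as above; the feature vocabulary is invented):
if __name__ == "__main__":
    sample_features = find_features("The movie was great fun", ["great", "boring", "fun"])
    print(sample_features)  # {'great': True, 'boring': False, 'fun': True}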
def get_word_count(text):
    word_count = 0
    filterwords = filter(not_punctuation, word_tokenize(text))
    for word in filterwords:
        word_count = word_count + 1
    return word_count
import string

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
from multiprocessing import Array

fi = 'test_data.txt'
printable = set(string.printable)

# stop words from nltk
stop_words = set(stopwords.words("english"))

file_con = open(fi).read().lower()
# keep only printable characters
file_content = ''.join(ch for ch in file_con if ch in printable)
example_words = word_tokenize(file_content)

# removing punctuation
example_words = [w for w in example_words if w not in string.punctuation]

# removing stop_words
cleaned_text = [w for w in example_words if w not in stop_words]
print(cleaned_text)

cleaned_t = " ".join(cleaned_text)
f = open('cleaned_test_data.txt', 'w')
f.write(cleaned_t)
def text_statistics(text):
    word_count = get_word_count(text)
    sent_count = get_sent_count(text)
    syllable_count = sum(map(lambda w: allnumsyllables(w), word_tokenize(text)))
    return word_count, sent_count, syllable_count
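# The three counts returned above are exactly what the classic Flesch reading-ease
# score needs. A hedged sketch: this is the standard formula, not necessarily how
# the rest of this module computes readability.
def flesch_reading_ease(text):
    word_count, sent_count, syllable_count = text_statistics(text)
    return 206.835 - 1.015 * (word_count / sent_count) - 84.6 * (syllable_count / word_count)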
def s2s_preprocess(train_file_name, test_file_name): raw_sentences = list() ontology_results = list() max_length = 0 for one_line in open(train_file_name): one_line = one_line.strip() print(one_line) if len(one_line.split("\t")) != 2: continue raw_sentence = one_line.split("\t")[0] ontology_string = one_line.split("\t")[1] tokenized_list = word_tokenize(raw_sentence) if len(tokenized_list) > max_length: max_length = len(tokenized_list) ontology_tuple = ontology_string.split() if len(ontology_tuple) != 3: continue raw_sentences.append(tokenized_list) ontology_results.append(ontology_tuple) token_voc_list = list() ontology_voc_list = list() for one_raw_sentence in raw_sentences: for one_token in one_raw_sentence: token_voc_list.append(one_token) token_voc_list = list(set(token_voc_list)) for one_ontology_result in ontology_results: for one_ontology in one_ontology_result: ontology_voc_list.append(one_ontology) ontology_voc_list = list(set(ontology_voc_list)) token_idx_dict, idx_token_dict = dictionary_generator(token_voc_list, eos_flag=False) ontology_idx_dict, idx_ontology_dict = dictionary_generator(ontology_voc_list, oov_flag=False) token_store_data = list() for one_raw_sentence in raw_sentences: token_store_data.append(data_indexer(one_raw_sentence, token_idx_dict)) ontology_store_data = list() for one_ontology_result in ontology_results: ontology_store_data.append(data_indexer(one_ontology_result, ontology_idx_dict)) pretrained_dict = dict() print("Loading pretrained Word2Vec model ...") w2v_embedding_path = "data/w2v/wiki20170101" w2v_model = Word2Vec.load(w2v_embedding_path) for one_line in open(train_file_name): one_line = one_line.strip() if len(one_line.split("\t")) != 2: continue raw_sentence = one_line.split("\t")[0] tokenized_list = word_tokenize(raw_sentence) for one_token in tokenized_list: if one_token not in w2v_model.wv.vocab: continue pretrained_dict[one_token] = w2v_model[one_token] for one_line in open(test_file_name): one_line = one_line.strip() if len(one_line.split("\t")) != 2: continue raw_sentence = one_line.split("\t")[0] tokenized_list = word_tokenize(raw_sentence) for one_token in tokenized_list: if one_token not in w2v_model.wv.vocab: continue pretrained_dict[one_token] = w2v_model[one_token] processed_data = (token_idx_dict, idx_token_dict, ontology_idx_dict, idx_ontology_dict, pretrained_dict, token_store_data, ontology_store_data, raw_sentences, ontology_results) pickle.dump(processed_data, open("data/preprocessed/20180405.pkl", "wb"))
def __tokenize(self, a_txt: str = ''):
    self.Tokens = word_tokenize(a_txt)
    print(self.Tokens)