def redact_gender(contents):
    # List of gendered words
    genders = [
        'he', "hes", 'her', 'she', "shes", 'him', 'his', 'woman', 'man',
        'lady', 'ladies', 'girl', 'boy', 'women', 'men', 'son', "son's",
        'daughter', "daughters", 'father', "fathers", 'mother', 'sister',
        'brother', 'herself', 'himself', "mothers", 'female', 'male'
    ]
    redacted = []  # list to return, holds redacted text
    redact = []    # list that holds words to redact

    # Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        # Tokenize sentence into words
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)

        # Ignores upper/lower case and punctuation marks
        for word in words:
            w = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf8')
            if w.translate(str.maketrans('', '', string.punctuation)).casefold() in genders:
                redact.append(word)

    redacted = redact_items(redact, contents)
    return (redacted, redact)
def clean_review(review: str) -> str:
    """Clean a review of unnecessary symbols."""
    stop_words = set(stopwords.words("english"))
    tokenizer = WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(word: str) -> str:
        """Get wordnet pos (part of speech) tags."""
        word_and_tag = pos_tag([word])[0]
        tag = word_and_tag[1]
        short_tag = tag[0]
        tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV,
        }
        return tag_dict.get(short_tag, wordnet.NOUN)

    review = re.sub("<.*?>", " ", review)
    # string.punctuation (32 chars) + string.digits (10 chars) map to 42 spaces
    review = review.translate(
        str.maketrans(string.punctuation + string.digits, 42 * " ")
    )
    review_tokens = tokenizer.tokenize(review)
    lower_review_tokens = (token.lower() for token in review_tokens)
    review_tokens = (token for token in lower_review_tokens if token not in stop_words)
    review_lemmas = (
        lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in review_tokens
    )
    return " ".join(review_lemmas)
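# Hedged usage sketch for clean_review; the sample review below is invented for
# illustration and assumes the required NLTK data (stopwords, wordnet, tagger) is installed.
sample = "<br />I watched this movie 3 times - it's GREAT, really great!"
print(clean_review(sample))
# expected to yield something like: "watch movie time great really great"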
def main(question: str, candicates: List[str]) -> None:
    init()
    # Step 1: tokenization
    tokenizer = WhitespaceTokenizer()
    question_tokens = tokenizer.tokenize(question)
    # tokenize() expects a string, so each candidate is tokenized separately
    candicates_tokens = [tokenizer.tokenize(candicate) for candicate in candicates]
    # Step 2: sentence embedding
    sentence_to_vector(question_tokens)
def w_tokenize(text):
    f = open(
        f'C:/Users/Jaroslav Marhivka/PycharmProjects/Text Generator/Text Generator/task/{text}',
        'r', encoding='utf-8')
    pre_corpus = f.read()
    ws_tokenizer = WhitespaceTokenizer()
    corpus = ws_tokenizer.tokenize(pre_corpus)
    f.close()
    return corpus
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []

            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
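# Hedged usage sketch for the subtitle tokenizer above; the dialogue text is invented.
dialogue = (
    "I think we should leave before the rain starts again.\n\n"
    "Agreed. The forecast looked terrible for the whole weekend."
)
for subtitle in tokenize_english_document(dialogue):
    print(subtitle)  # each item is a list of at most two lines of roughly 38 characters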
def __init__(self):
    self.database_word_list_hobbies = []
    self.database_word_list_location = []
    self.database_word_list_occupation = []
    self.database_word_list_institutions = []
    self.location = []
    self.hobbies = []
    self.occupation = []
    self.institution = []
    self.whitespace_wt = WhitespaceTokenizer()
    self.emails = []
def main():
    file_name = input()
    tokenizer = WhitespaceTokenizer()
    with open(file_name, "r", encoding="utf-8") as f:
        tokens = tokenizer.tokenize(f.read())
    trigrms = list(trigrams(tokens))
    trigrams_freq = defaultdict(Counter)
    for t in trigrms:
        trigrams_freq[f"{t[0]} {t[1]}"][t[2]] += 1
    for _ in range(10):
        print(*generate_sentence(trigrams_freq))
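# generate_sentence is not shown in the snippet above. A minimal hedged sketch of one
# possible implementation, assuming the "head -> Counter of tails" structure built in main():
import random

def generate_sentence(trigrams_freq, length=10):
    heads = list(trigrams_freq)
    # prefer heads whose first token looks like a sentence opener; fall back to any head
    starters = [h for h in heads if h[0].isupper() and h.split()[0][-1] not in ".!?"]
    sentence = random.choice(starters or heads).split()
    while len(sentence) < length:
        key = f"{sentence[-2]} {sentence[-1]}"
        if key not in trigrams_freq:
            break
        sentence.append(trigrams_freq[key].most_common(1)[0][0])
    return sentence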
def tokenization():
    while True:
        prompt = input('>')
        if prompt == 'exit':
            exit()
        try:
            file = open(prompt, 'r', encoding='utf-8')
            # tokens = regexp_tokenize(file.read(), r"[\w!]+")
            tokens = WhitespaceTokenizer().tokenize(file.read())
            token_count = len(tokens)
            unique_tokens_count = len(set(tokens))
            print(
                f'Corpus statistics\nAll tokens: {token_count}\nUnique tokens: {unique_tokens_count}'
            )
            break
        except FileNotFoundError:
            print('File not found in directory')

    while True:
        idx = input()
        if idx == 'exit':
            exit()
        try:
            print(tokens[int(idx)])
        except TypeError:
            print('Type Error. Please input an integer.')
        except IndexError:
            print(
                'Index Error. Please input an integer that is in the range of the corpus.'
            )
        except ValueError:
            print('Value Error. Please input an integer.')
def hasTextYear(tpentity):
    # remove ending punctuation
    text1 = tpentity.getText().strip(",.")
    # replace all other punctuation with spaces
    text = text1.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # make sure it is all letters
    m = re.search(r'[a-z,A-Z,-,\s]*', text)

    if m.group(0) != '':
        # split on spaces
        tokenized_text = WhitespaceTokenizer().tokenize(text)
        for t in tokenized_text:
            if utils.getNumberFromText(t) is None:
                return False, None, None, None
        val = utils.getNumberFromText(text)

        if val is not None:
            if val >= 1500 and val <= 2050:
                r = re.search(text1, tpentity.getText())
                start, end = r.span(0)
                return True, val, start, end
            else:
                return False, None, None, None
        else:
            return False, None, None, None
    return False, None, None, None
def post(self):
    """
    Word tokenize a policy based on the WhitespaceTokenizer.
    Return an array of tokens
    """
    args = document_parser.parse_args()
    tokenized = WhitespaceTokenizer().tokenize(text=args.document)
    return tokenized
def clean_text_col(self, text_col):
    text_col = text_col.apply(
        lambda text: WhitespaceTokenizer().tokenize(text))
    text_col = text_col.apply(lambda sent: [word.lower() for word in sent])
    text_col = text_col.apply(
        lambda sent: [word for word in sent if word not in stopwords])
    text_col = text_col.apply(lambda sent: self.word_only(sent))
    text_col = text_col.apply(
        lambda sent: [self.stemmer.stem(word) for word in sent])
    return text_col
def redact_loc(contents):
    redacted = []  # list to return, holds redacted text
    nnp = []       # list to hold proper nouns
    redact = []    # list that holds words to redact

    # Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    loc = []
    loc_ = []
    stop_words = set(stopwords.words('english'))
    ws = WhitespaceTokenizer()

    for sentence in sentences:
        # Tag GPE words in each sentence
        doc = nlp(sentence)
        for ent in doc.ents:
            if ent.label_ == 'GPE':
                loc.append(ent.text)

        # Strip stop words and separate by ws
        for l in loc:
            tokens = ws.tokenize(l)
            for t in tokens:
                if t not in stop_words:
                    loc_.append(t)

        words = ws.tokenize(sentence)

        # Checks words to see if any are/contain GPE words
        for word in words:
            i = 0
            while i < len(loc_):
                if loc_[i] in word:
                    redact.append(word)
                    break
                else:
                    i = i + 1

    redacted = redact_items(redact, contents)
    return (redacted, redact)
def classify(self, booking):
    """
    Classify booking and return prediction result
    :param booking: booking following BookingSchema in booking.py
    :return: category as string
    """
    # check if creditor_id is already known
    category = self.match_creditor_id(booking)
    if category != -1:
        return str(category), "0"

    # check if creditor_id is in purpose code
    wst = WhitespaceTokenizer()
    tokens = wst.tokenize(booking.usage)
    try:
        print(tokens[tokens.index("Einreicher-ID") + 1])
        booking.creditor_id = tokens[tokens.index("Einreicher-ID") + 1]
    except ValueError:
        print("No SEPA purpose code found")

    # start text analysis
    term_list = booking.text + ' ' + booking.usage + ' ' + booking.owner
    word_counts = self.feature_extractor.extract_termlist_features(term_list)
    predict_probabilities = self.clf.predict_proba(word_counts)
    # category = self.clf.predict(example_counts)

    # if max prediction probability is less than 70% assume that the booking category is unknown
    prob = str(max(max(predict_probabilities)))
    # print("P:" + str(prob))
    # print("Highest ranked category: " + str(category_names[np.argmax(predict_probabilities)]))
    if max(max(predict_probabilities)) < 0.7:
        category = str(fbcat.SONSTIGES.name)  # fallback category
    else:
        category = str(category_names[np.argmax(predict_probabilities)])
    # print(category)
    return str(category), predict_probabilities
def redact_concepts(contents, concepts):
    synonyms = []  # will hold list of synonyms
    redacted = []  # return this; will hold redacted text
    redact = []    # holds list of words to redact

    # Makes list of synonyms of concept(s)
    for i in concepts:
        for syn in wordnet.synsets(i):
            for l in syn.lemma_names():
                synonyms.append(l)

    # Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        # Tokenize sentence into words
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)
        has_syn = 0

        # Checks words to see if any are/contain the synonyms
        for word in words:
            i = 0
            while i < len(synonyms):
                if synonyms[i] in word.casefold():
                    has_syn = 1
                    break
                else:
                    i = i + 1

        # If so, all words in sentence are to be redacted
        if has_syn == 1:
            for word in words:
                redact.append(word)

    redacted = redact_items(redact, contents)
    return (redacted, redact)
def redact_numbers(contents):
    # List of numbers (spelled out)
    numbers = [
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'eleven', 'twelve', 'teen', 'twenty', 'thirty',
        'forty', 'fifty', 'hundred', 'thousand', 'million'
    ]
    redacted = []  # list to return, holds redacted text
    redact = []    # list that holds words to redact

    # Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        # Tokenize sentence into words
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)

        # Ignores upper/lower case and punctuation marks
        for word in words:
            i = 0
            while i < len(numbers):
                if numbers[i] in word.casefold():
                    redact.append(word)
                    break
                else:
                    i = i + 1

            # searches for digits in each word
            digits = re.findall(r'\d+', word)
            if digits:
                redact.append(word)

    redacted = redact_items(redact, contents)
    return (redacted, redact)
def count_frequency(self):
    self.__comments_dataframe = list_dataframe()
    read_comment_for_ngrams = self.__comments_dataframe[0]
    white_space_tokenize = WhitespaceTokenizer()

    # join all comments into a single text
    comment_ngrams = ' '.join([text for text in read_comment_for_ngrams["comments"]])

    # split the text into words
    comments_tokenize = white_space_tokenize.tokenize(comment_ngrams)

    # compute the frequency distribution of the text
    freq_dist = FreqDist(comments_tokenize)
    df_freq_dist = pd.DataFrame({
        "word": list(freq_dist.keys()),
        "frequency": list(freq_dist.values())
    })

    higher_frequency = df_freq_dist.sort_values(by="frequency", ascending=False).head(n=20)
    higher_frequency_dict = higher_frequency.to_dict(orient='list')

    self.__frequency["words"] = higher_frequency_dict["word"]
    self.__frequency["frequency"] = higher_frequency_dict["frequency"]
    return self.__frequency
def extract_text(adr):
    items = load_json_list(adr)
    data = []
    for item in items:
        if 'extended_tweet' in item.keys():
            text = item['extended_tweet']['full_text']
        else:
            text = item['text']
        tokens = WhitespaceTokenizer().tokenize(text)
        if tokens[0] != 'Wind':
            text = re.sub(r"\s+", " ", text)
            text = text.strip()
            data.append(text)
    return data
def redact_names(contents):
    redacted = []  # list to return, holds redacted text
    nnp = []       # list to hold proper nouns
    redact = []    # list that holds words to redact
    no_words = 0

    # Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        # Tokenize sentence into words, tag words' pos
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)
        tagged = nltk.pos_tag(words)

        # Goes through each word to see if it's an NNP
        for word, tag in tagged:
            # If an NNP, add to nnp list
            if tag == 'NNP':
                nnp.append(word)

        # Checks words to see if any are/contain NNP words
        for word in words:
            i = 0
            no_words = no_words + 1
            while i < len(nnp):
                if nnp[i] in word:
                    redact.append(word)
                    break
                else:
                    i = i + 1

    redacted = redact_items(redact, contents)
    return (redacted, redact, no_words)
class NonAlphaNumCharTokenizer(object):
    """
    Replace the non alpha numeric characters by space and tokenize the sentence by space.
    For example: the sentence 'hello world, org.eclipse.core.launcher.main.main' is
    tokenized to [hello, world, org, eclipse, core, launcher, main, main].
    """

    REGEX = re.compile(r'[\W_]+', re.UNICODE)

    def __init__(self):
        self.tokenizer = WhitespaceTokenizer()

    def tokenize(self, text):
        text = re.sub(NonAlphaNumCharTokenizer.REGEX, ' ', text)
        return self.tokenizer.tokenize(text)
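# Quick check of the tokenizer above, using the example from its docstring:
tokenizer = NonAlphaNumCharTokenizer()
print(tokenizer.tokenize('hello world, org.eclipse.core.launcher.main.main'))
# ['hello', 'world', 'org', 'eclipse', 'core', 'launcher', 'main', 'main']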
def redact_items(red_list, contents):
    redacted = []  # list to return, holds redacted text

    # Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        # Tokenize sentence into words
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)

        # Goes through each word to see if it's in the redaction list
        for word in words:
            # Redact word
            if word in red_list:
                r = '\u2588' * len(word)
                redacted.append(r)
            else:
                redacted.append(word)

    redacted = ' '.join(redacted)
    return redacted
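# A small hedged demo of the shared helper above; the sample sentence is invented.
sample = "Alice met Bob in Paris."
print(redact_items(["Alice", "Bob"], sample))
# '█████ met ███ in Paris.'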
def doslogan4():
    transcript = open('transcription.txt').read()
    words = WhitespaceTokenizer().tokenize(transcript)
    tagged = POS_tagger(words)
    tags = alltags(tagged)
    newlist = []
    structure1 = [random.choice(syns), 'determiner', 'noun']
    for index, item in enumerate(structure1):
        if item in tags:
            one = givemeone(item, tagged)
            newlist.append(one)
        else:
            newlist.append(item)
    print(' '.join(newlist))
def doslogan(structure):
    transcript = open('transcription.txt').read()
    words = WhitespaceTokenizer().tokenize(transcript)
    tagged = POS_tagger(words)
    tags = alltags(tagged)
    newlist = []
    for index, item in enumerate(structure):
        if item in tags:
            one = givemeone(item, tagged)
            newlist.append(one)
        else:
            newlist.append(item)
    cprint(' '.join(newlist), random.choice(color), random.choice(on_color))
def ratio(tweets):
    nom = ["aint", "ain’t"]
    denom = ['isn’t', 'aren’t', 'wasn’t', 'weren’t', 'haven’t', 'hasn’t', 'hadn’t',
             'isnt', 'arent', 'wasnt', 'werent', 'hasnt', 'havent', 'hadnt',
             'is not', 'are not', 'was not', 'were not', 'have not', 'has not', 'had not']
    nom_counter, denom_counter = 0, 0
    for tweet in tweets:
        tokens = WhitespaceTokenizer().tokenize(tweet)
        for item in tokens:
            if item in nom:
                nom_counter += 1
            if item in denom:
                denom_counter += 1
    if denom_counter == 0:
        r = nom_counter
    else:
        r = round(nom_counter / denom_counter, 4)
    return r, nom_counter, denom_counter
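# Hedged sanity check for ratio(); the two example tweets are invented.
example_tweets = ["that aint right", "it isn’t over yet"]
print(ratio(example_tweets))
# (1.0, 1, 1)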
class GeneralTokenizer(Tokenizer):
    def __init__(self):
        self.tokenizer = WhitespaceTokenizer()

    def normalize(self, text):
        return ' '.join(self.tokenize(text))

    def tokenize(self, text):
        result = []
        # Python 2: coerce non-unicode input to unicode before tokenizing
        if type(text) is not unicode:
            if type(text) in (int, float):
                text = str(text)
            text = unicode(text, 'utf-8', errors='ignore')
        # pre tokenize
        for word in self.tokenizer.tokenize(text):
            word = word.strip(string.punctuation).lower()
            if word.endswith("'s") or word.endswith(u"’s"):
                word = word[:-2]
            if word and word.strip():
                result.append(word)
        return result
def whitespace_tokenize(self, text):
    wst = WhitespaceTokenizer()
    return wst.tokenize(text)
"""Forth stage of the 'Text Generator' project. We are taking text corpus as an input, then create bigrams from the tokenized corpus, sort them in 'freq_dict' dictionary with heads as keys, and list of tails as values. After that we randomly choose the first word of the sentence, and the second word will be predicted by looking up the first word of the chain in the model and choosing the most probable next word from the set of possible follow-ups. This step is repeated 10 times (10 words in 1 sentence). We also print 10 sentences. """ from nltk import WhitespaceTokenizer from nltk.util import bigrams from collections import Counter import random """First we open file and tokenize it""" with open(input(), 'r', encoding='utf-8') as file: corpus = WhitespaceTokenizer().tokenize(file.read()) """Then we make a bigram and organize a dictionary in fallowing manner: key is a head of bigram, and values are all the tails for it. """ my_bigrams = list(bigrams(corpus)) bigrams_dict = {} for head, tail in my_bigrams: bigrams_dict.setdefault(head, []).append(tail) """Then we create new dictionary: key is a head, and values is a dictionary, with tails as keys and their count as values """ freq_dict = {} for head, tails in bigrams_dict.items(): freq_dict[head] = Counter(tails)
        lines_new = lines[startIndex:endIndex]
        tranche1 = lines_new[getIndex('Tranche 1', lines_new)]
        tranche2 = lines_new[getIndex('Tranche 2', lines_new)]
        tranche3 = lines_new[getIndex('Tranche 3', lines_new)]
        contract4 = lines_new[getIndex('Contract 4', lines_new)]
        contracts = [tranche1, tranche2, tranche3, contract4]

        for contract in contracts:
            # Tokenize the lines
            default_st = nltk.sent_tokenize
            sentences = default_st(text=contract)
            for sentence in sentences:
                # print(sentence)
                # Tokenize sentence into words, tag words' pos
                ws = WhitespaceTokenizer()
                words = ws.tokenize(sentence)
                # print(words)

                # Read contents into df
                df['Field'] = [field]
                df['Contract'] = [words[0] + ' ' + words[1]]
                df['Date'] = [date]
                df['SalesGas_mmscf'] = [words[3]]
                df['Cond/Oil_bbl'] = [words[4]]
                mother_df = mother_df.append(other=df, ignore_index=True)
    except TypeError:
        print('error: ' + f)
def main(_run, _config, _seed, _log):
    """
    :param _run:
    :param _config:
    :param _seed:
    :param _log:
    :return:
    """

    """
    Setting and loading parameters
    """
    # Setting logger
    args = _config
    logger = _log
    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)
    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")
    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Setting the parameters to save and loading parameters
    importantParameters = ['compare_aggregation', 'categorical']
    parametersToSave = dict([(parName, args[parName]) for parName in importantParameters])

    if args['load'] is not None:
        mapLocation = (lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    preprocessors = PreprocessorList()
    inputHandlers = []

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        categoricalEncoder, _, _ = processCategoricalParam(
            categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, None, logger)
    else:
        categoricalEncoder = None

    filterInputHandlers = []

    compareAggOpt = args['compare_aggregation']
    databasePath = args['bug_database']

    # Loading word embedding
    if compareAggOpt["lexicon"]:
        emb = np.load(compareAggOpt["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(compareAggOpt["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)

        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
    elif compareAggOpt["word_embedding"]:
        # todo: Allow use embeddings and other representation
        lexicon, embedding = Embedding.fromFile(
            compareAggOpt['word_embedding'], 'UUUKNNN', hasHeader=False, paddingSym=paddingSym)
        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
        paddingId = lexicon.getLexiconIndex(paddingSym)
    else:
        embedding = None

    if compareAggOpt["norm_word_embedding"]:
        embedding.zscoreNormalization()

    # Tokenizer
    if compareAggOpt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif compareAggOpt['tokenizer'] == 'white_space':
        logger.info("Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % compareAggOpt['tokenizer'])

    # Preparing input handlers, preprocessors and cache
    minSeqSize = max(compareAggOpt['aggregate']["window"]) if compareAggOpt['aggregate']["model"] == "cnn" else -1
    bow = compareAggOpt.get('bow', False)
    freq = compareAggOpt.get('frequency', False) and bow

    logger.info("BoW={} and TF={}".format(bow, freq))

    if compareAggOpt['extractor'] is not None:
        # Use summary and description (concatenated) to address this problem
        logger.info("Using Summary and Description information.")
        # Loading Filters
        extractorFilters = loadFilters(compareAggOpt['extractor']['filters'])

        arguments = (databasePath, compareAggOpt['word_embedding'],
                     str(compareAggOpt['lexicon']),
                     ' '.join(sorted([fil.__class__.__name__ for fil in extractorFilters])),
                     compareAggOpt['tokenizer'], str(bow), str(freq),
                     SABDEncoderPreprocessor.__name__)

        inputHandlers.append(SABDInputHandler(paddingId, minSeqSize))
        extractorCache = PreprocessingCache(cacheFolder, arguments)

        if bow:
            extractorPreprocessor = SABDBoWPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer, paddingId, freq, extractorCache)
        else:
            extractorPreprocessor = SABDEncoderPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer, paddingId, extractorCache)

        preprocessors.append(extractorPreprocessor)

    # Create model
    model = SABD(embedding, categoricalEncoder, compareAggOpt['extractor'],
                 compareAggOpt['matching'], compareAggOpt['aggregate'],
                 compareAggOpt['classifier'], freq)

    if args['loss'] == 'bce':
        logger.info("Using BCE Loss: margin={}".format(args['margin']))
        lossFn = BCELoss()
        lossNoReduction = BCELoss(reduction='none')
        cmp_collate = PairBugCollate(inputHandlers, torch.float32, unsqueeze_target=True)
    elif args['loss'] == 'triplet':
        logger.info("Using Triplet Loss: margin={}".format(args['margin']))
        lossFn = TripletLoss(args['margin'])
        lossNoReduction = TripletLoss(args['margin'], reduction='none')
        cmp_collate = TripletBugCollate(inputHandlers)

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)

    """
    Loading the training and validation. Also, it sets how the negative examples will be generated.
    """
    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator')
        trainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none')
        masterIdByBugId = bugReportDatabase.getMasterIdByBugId()
        randomAnchor = negativePairGenOpt['random_anchor']

        if not offlineGeneration:
            logger.info("Not generating the negative examples dynamically.")
            negativePairGenerator = None
        else:
            pairGenType = negativePairGenOpt['type']

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, randomAnchor=randomAnchor)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['n_tries'],
                    device, randomAnchor=randomAnchor)

            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'],
                    bugIds, trainingDataset.duplicateIds, masterIdByBugId,
                    negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

            elif pairGenType == 'product_component':
                logger.info("Product Component Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = ProductComponentRandomGen(
                    bugReportDatabase, preprocessors, cmp_collate,
                    negativePairGenOpt['rate'], bugIds, masterIdByBugId,
                    negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['k'],
                    device, randomAnchor=randomAnchor)

            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")
                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

            elif pairGenType == "positive_pre":
                logger.info("Positive Pre-selected list generator")
                negativePairGenerator = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, cmp_collate,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")
                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['n_tries'],
                    device, randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))

            elif pairGenType == "misc_non_zero_positive_pre":
                logger.info("Misc: non-zero and Positive Pre-selected list generator")
                negativePairGenerator1 = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, cmp_collate,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['n_tries'],
                    device, randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))

            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, hard and pre"
                    % pairGenType)

        if isinstance(lossFn, BCELoss):
            training_reader = PairBugDatasetReader(
                trainingFile, preprocessors, negativePairGenerator,
                randomInvertPair=args['random_switch'])
        elif isinstance(lossFn, TripletLoss):
            training_reader = TripletBugDatasetReader(
                trainingFile, preprocessors, negativePairGenerator,
                randomInvertPair=args['random_switch'])

        trainingLoader = DataLoader(training_reader, batch_size=batchSize,
                                    collate_fn=cmp_collate.collate, shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        if isinstance(lossFn, BCELoss):
            validation_reader = PairBugDatasetReader(
                args.get('pairs_validation'), preprocessors)
        elif isinstance(lossFn, TripletLoss):
            validation_reader = TripletBugDatasetReader(
                args.get('pairs_validation'), preprocessors)

        validationLoader = DataLoader(validation_reader, batch_size=batchSize,
                                      collate_fn=cmp_collate.collate)
        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    """
    Train and evaluate the model.
    """
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2'])

    # Recall rate
    rankingScorer = GeneralScorer(
        model, preprocessors, device,
        PairBugCollate(inputHandlers, ignore_target=True),
        args['ranking_batch_size'], args['ranking_n_workers'])

    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt, args['sample_size_rr_tr'])

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(
            recallEstimationOpt, args['sample_size_rr_val'])

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lrSchedulerOpt["step_size"], args["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info("Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
                    (lrSchedulerOpt["decay"]))
        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear"
            % lrSchedulerOpt["type"])

    # Set training functions
    def trainingIteration(engine, batch):
        engine.kk = 0
        model.train()
        optimizer.zero_grad()
        x, y = cmp_collate.to(batch, device)
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    def scoreDistanceTrans(output):
        if len(output) == 3:
            _, y_pred, y = output
        else:
            y_pred, y = output
        if lossFn == F.nll_loss:
            return torch.exp(y_pred[:, 1]), y
        elif isinstance(lossFn, (BCELoss)):
            return y_pred, y

    trainer = Engine(trainingIteration)
    trainingMetrics = {'training_loss': AverageLoss(lossFn)}

    if isinstance(lossFn, BCELoss):
        trainingMetrics['training_dist_target'] = MeanScoreDistance(
            output_transform=scoreDistanceTrans)
        trainingMetrics['training_acc'] = AccuracyWrapper(
            output_transform=thresholded_output_transform)
        trainingMetrics['training_precision'] = PrecisionWrapper(
            output_transform=thresholded_output_transform)
        trainingMetrics['training_recall'] = RecallWrapper(
            output_transform=thresholded_output_transform)

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        if not hasattr(engine, 'kk'):
            engine.kk = 0
        model.eval()
        with torch.no_grad():
            x, y = cmp_collate.to(batch, device)
            y_pred = model(*x)
            return y_pred, y

    validationMetrics = {
        'validation_loss': LossWrapper(
            lossFn, output_transform=lambda x: (x[0], x[0][0]) if x[1] is None else x)
    }

    if isinstance(lossFn, BCELoss):
        validationMetrics['validation_dist_target'] = MeanScoreDistance(
            output_transform=scoreDistanceTrans)
        validationMetrics['validation_acc'] = AccuracyWrapper(
            output_transform=thresholded_output_transform)
        validationMetrics['validation_precision'] = PrecisionWrapper(
            output_transform=thresholded_output_transform)
        validationMetrics['validation_recall'] = RecallWrapper(
            output_transform=thresholded_output_transform)

    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    # recommendation
    recommendation_fn = generateRecommendationList

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = args['epochs'] - epoch == 0

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain, rankingScorer,
                             bugReportDatabase, None, epoch, "train",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRanking, rankingScorer,
                             bugReportDatabase, args.get("ranking_result_file"),
                             epoch, "validation",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if not lastEpoch:
            training_reader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselectListRanking, rankingScorer,
                             bugReportDatabase, args.get("ranking_result_file"),
                             0, "validation", recommendationListfn=recommendation_fn)

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader, batch_size=batchSize,
                                collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError(
                'Evaluation of pairs using tanh was not implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy': ignite.metrics.Accuracy(
                output_transform=thresholded_output_transform),
            'test_precision': ignite.metrics.Precision(
                output_transform=thresholded_output_transform),
            'test_recall': ignite.metrics.Recall(
                output_transform=thresholded_output_transform),
            'test_predictions': PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'correct': metric._num_correct,
                    'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'tp': metric._true_positives.item(),
                    'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'accuracy': np.float(acc),
                    'precision': prec.cpu().numpy().tolist(),
                    'recall': recall.cpu().numpy().tolist(),
                    'f1': f1.cpu().numpy().tolist(),
                    'confusion_matrix': metricValue.cpu().numpy().tolist(),
                    'epoch': None
                })
                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'predictions': metric.predictions
                })

    # Calculate recall rate
    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset,
                                      recallRateOpt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase, recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run, logger, rankingClass, rankingScorer, bugReportDatabase,
                         recallRateOpt["result_file"], 0, None, group_by_master,
                         recommendationListfn=recommendation_fn)
from bs4 import BeautifulSoup
import re
import csv
from nltk import ngrams, word_tokenize, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from selenium import webdriver

german_stopwords = stopwords.words('german')
english_stopwords = stopwords.words("english")
# https://www.pc-erfahrung.de/nebenrubriken/sonstiges/webdesignwebentwicklung/stoppwortliste.html

browser = webdriver.Chrome('/home/marco/Downloads/chromedriver')
browser.implicitly_wait(10)

stemmer = SnowballStemmer('german')
stemmed_words = []
all_bigrams = []
whitespace_wt = WhitespaceTokenizer()


def compare_text_with_keywords(html_string):
    all_formatted_words = []
    all_stemmed_words = []
    # to split a string at uppercase, MarcoLang will be Marco Lang
    splitted_string_by_uppercase = re.findall('[A-Z][^A-Z]*', html_string)
    string = str(splitted_string_by_uppercase)
    # formatted_string = string.replace(","," ").replace("'","").replace("[","").replace("]","").replace("\\n","")
    formatted_string = string.replace("\\n", "")  # to delete all line breaks
    splitted_words = word_tokenize(formatted_string.lower())
    for word in splitted_words:
        formatted_word = ''.join(e for e in word if e.isalnum())
        if formatted_word != '':
import pymysql
import math
from nltk import WhitespaceTokenizer

connection = pymysql.connect(host="127.0.0.1",
                             user="******",
                             password="******",
                             charset='utf8',
                             db='tf-idf',
                             cursorclass=pymysql.cursors.DictCursor)
cursor = connection.cursor()
tokenizer = WhitespaceTokenizer()

sql_tf = 'SELECT * FROM wiki WHERE id=%s'
sql_idf = 'SELECT COUNT(*) count FROM inverted_index WHERE term=%s'


def tf_idf(id, term):
    cursor.execute(sql_tf, id)
    text = cursor.fetchone()['text']
    words = tokenizer.tokenize(text)
    words = list(map(lambda x: x.lower(), words))
    tf = math.log(1 + words.count(term) / len(words))
    cursor.execute(sql_idf, term)
    idf = 1 / cursor.fetchone()['count']
    print('TF-IDF of the term ' + term + ' in ID=' + str(id) + ': ' + str(tf * idf))


terms = [(41631770, 'also'), (6688599, 'debut'), (13794826, 'language')]
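# The snippet above defines the (id, term) pairs but does not show the call;
# a minimal hedged driver loop, assuming tf_idf and connection as defined above:
for article_id, term in terms:
    tf_idf(article_id, term)
connection.close()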
from collections import defaultdict, Counter
from nltk import WhitespaceTokenizer, bigrams
from random import choice, choices

bigrams_list = []
bigrams_dict = defaultdict(Counter)

with open(input(), 'r', encoding='utf-8') as file:
    for line in file:
        bigrams_list.extend(bigrams(WhitespaceTokenizer().tokenize(line)))

for head, tail in bigrams_list:
    bigrams_dict[head][tail] += 1

heads = list(bigrams_dict.keys())
first_word_set = set()
while len(first_word_set) < 10:
    word = choice(heads)
    if word[0].isupper() and word[-1] not in ".!?":
        first_word_set.add(word)

for word in first_word_set:
    sentence = [word]
    while len(sentence) < 5 or sentence[-1][-1] not in ".!?":
        next_words = list(bigrams_dict[sentence[-1]].keys())
        next_words_count = list(bigrams_dict[sentence[-1]].values())
        sentence.extend(choices(next_words, weights=next_words_count))
    print(" ".join(sentence))