Example #1
def redact_gender(contents):

    #List of gendered words
    genders = [
        'he', "hes", 'her', 'she', "shes", 'him', 'his', 'woman', 'man',
        'lady', 'ladies', 'girl', 'boy', 'women', 'men', 'son', "son's",
        'daughter', "daughters", 'father', "fathers", 'mother', 'sister',
        'brother', 'herself', 'himself', "mothers", 'female', 'male'
    ]

    redacted = []  #list to return, holds redacted text
    redact = []  #list that holds words to redact

    #Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        #Tokenize sentence into words
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)

        #Ignores upper/lower case and punctuation marks
        for word in words:
            w = unicodedata.normalize('NFKD',
                                      word).encode('ascii',
                                                   'ignore').decode('utf8')
            if ((w.translate(str.maketrans('', '',
                                           string.punctuation)).casefold()
                 in genders)):
                redact.append(word)

    redacted = redact_items(redact, contents)
    return (redacted, redact)
Example #2
def clean_review(review: str) -> str:
    """Clean a review of unnecessary symbols."""
    stop_words = set(stopwords.words("english"))
    tokenizer = WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(word: str) -> str:
        """Get wordnet pos (part of speech) tags."""
        word_and_tag = pos_tag([word])[0]
        tag = word_and_tag[1]
        short_tag = tag[0]
        tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV,
        }
        return tag_dict.get(short_tag, wordnet.NOUN)

    review = re.sub("<.*?>", " ", review)
    review = review.translate(
        str.maketrans(string.punctuation + string.digits,
                      " " * len(string.punctuation + string.digits))
    )
    review_tokens = tokenizer.tokenize(review)
    lower_review_tokens = (token.lower() for token in review_tokens)
    review_tokens = (token for token in lower_review_tokens if token not in stop_words)
    review_lemmas = (
        lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in review_tokens
    )
    return " ".join(review_lemmas)
Example #3
def main(question: str, candidates: List[str]) -> None:
    init()

    # Step 1: tokenization
    tokenizer = WhitespaceTokenizer()
    question_tokens = tokenizer.tokenize(question)
    candidates_tokens = [tokenizer.tokenize(candidate) for candidate in candidates]

    # Step 2: sentence embedding
    sentence_to_vector(question_tokens)
Example #4
def w_tokenize(text):
    with open(
            f'C:/Users/Jaroslav Marhivka/PycharmProjects/Text Generator/Text Generator/task/{text}',
            'r',
            encoding='utf-8') as f:
        pre_corpus = f.read()
    ws_tokenizer = WhitespaceTokenizer()
    corpus = ws_tokenizer.tokenize(pre_corpus)
    return corpus
Example #5
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text: raw English conversation text
    :return: a list of subtitles, each a list of at most two lines
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(
                        current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
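A brief usage sketch (hypothetical input; assumes the nltk punkt data used by PunktSentenceTokenizer is available). Each returned item is one subtitle, i.e. a list of at most two lines of roughly 38 characters:

conversation = ("Hello there.\n\n"
                "This second speaker block is long enough that it has to be "
                "wrapped across several subtitle lines of at most thirty-eight "
                "characters each.")
for subtitle in tokenize_english_document(conversation):
    print(subtitle)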
Example #6
 def __init__(self):
     self.database_word_list_hobbies = []
     self.database_word_list_location = []
     self.database_word_list_occupation = []
     self.database_word_list_institutions = []
     self.location = []
     self.hobbies = []
     self.occupation = []
     self.institution = []
     self.whitespace_wt = WhitespaceTokenizer()
     self.emails = []
Example #7
def main():
    file_name = input()
    tokenizer = WhitespaceTokenizer()
    with open(file_name, "r", encoding="utf-8") as f:
        tokens = tokenizer.tokenize(f.read())
    trigram_list = list(trigrams(tokens))
    trigrams_freq = defaultdict(Counter)
    for t in trigram_list:
        trigrams_freq[f"{t[0]} {t[1]}"][t[2]] += 1

    for _ in range(10):
        print(*generate_sentence(trigrams_freq))
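The generate_sentence helper is not part of this snippet; a minimal sketch of what it might look like, assuming it starts from a random two-word head and repeatedly appends the most frequent third word:

import random

def generate_sentence(trigrams_freq, length=10):
    # Hypothetical helper: pick a random two-word head, then keep appending
    # the most common continuation until the sentence reaches `length` words.
    head = random.choice(list(trigrams_freq))
    sentence = head.split()
    while len(sentence) < length:
        tails = trigrams_freq.get(" ".join(sentence[-2:]))
        if not tails:
            break
        sentence.append(tails.most_common(1)[0][0])
    return sentence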
Example #8
def tokenization():
    while True:
        prompt = input('>')

        if prompt == 'exit':
            exit()
        try:
            file = open(prompt, 'r', encoding='utf-8')
            # tokens = regexp_tokenize(file.read(), r"[\w!]+")
            tokens = WhitespaceTokenizer().tokenize(file.read())
            token_count = len(tokens)
            unique_tokens_count = len(set(tokens))
            print(
                f'Corpus statistics\nAll tokens: {token_count}\nUnique tokens: {unique_tokens_count}'
            )
            break
        except FileNotFoundError:
            print('File not found in directory')

    while True:
        idx = input()

        if idx == 'exit':
            exit()
        try:
            print(tokens[int(idx)])
        except TypeError:
            print('Type Error. Please input an integer.')
        except IndexError:
            print(
                'Index Error. Please input an integer that is in the range of the corpus.'
            )
        except ValueError:
            print('Value Error. Please input an integer.')
Example #9
def hasTextYear(tpentity):
    #remove ending punctuation
    text1 = tpentity.getText().strip(",.")
    #replace all other punctuation and replace with spaces
    text = text1.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    #make sure it is all letters
    m = re.search(r'[a-z,A-Z,-,\s]*', text)
    if m.group(0) != '':
        ##split on spaces
        tokenized_text = WhitespaceTokenizer().tokenize(text)
        for t in tokenized_text:
            if utils.getNumberFromText(t) is None:
                return False, None, None, None
        val = utils.getNumberFromText(text)

        if val is not None:
            if val >= 1500 and val <= 2050:
                r = re.search(text1, tpentity.getText())
                start, end = r.span(0)
                return True, val, start, end
            else:
                return False, None, None, None
        else:
            return False, None, None, None
    return False, None, None, None
Example #10
 def post(self):
     """
     Word tokenize a policy based on the WhitespaceTokenizer.
     Return an array of tokens
     """
     args = document_parser.parse_args()
     tokenized = WhitespaceTokenizer().tokenize(text=args.document)
     return tokenized
Example #11
 def clean_text_col(self, text_col):
     text_col = text_col.apply(
         lambda text: WhitespaceTokenizer().tokenize(text))
     text_col = text_col.apply(lambda sent: [word.lower() for word in sent])
     text_col = text_col.apply(
         lambda sent: [word for word in sent if word not in stopwords])
     text_col = text_col.apply(lambda sent: self.word_only(sent))
     text_col = text_col.apply(
         lambda sent: [self.stemmer.stem(word) for word in sent])
     return text_col
Example #12
def redact_loc(contents):

    redacted = []  #list to return, holds redacted text
    nnp = []  #list to hold proper nouns
    redact = []  #list that holds words to redact

    #Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    loc = []
    loc_ = []
    stop_words = set(stopwords.words('english'))
    ws = WhitespaceTokenizer()

    for sentence in sentences:
        #Tag GPE words in each sentence
        doc = nlp(sentence)
        for ent in doc.ents:
            if (ent.label_ == 'GPE'):
                loc.append(ent.text)
                #Strip stop words and separate by ws
                for l in loc:
                    tokens = ws.tokenize(l)
                    for t in tokens:
                        if t not in stop_words:
                            loc_.append(t)

        words = ws.tokenize(sentence)
        #Checks words to see if any are/contain GPE words
        for word in words:
            i = 0
            while i < len(loc_):
                if (loc_[i] in word):
                    redact.append(word)
                    break
                else:
                    i = i + 1

    redacted = redact_items(redact, contents)
    return (redacted, redact)
Example #13
    def classify(self, booking):
        """
        Classify booking and return prediction result
        :param booking: booking following BookingSchema in booking.py
        :return: category as string
        """
        # check if creditor_id is already known
        category = self.match_creditor_id(booking)
        if category != -1:
            return str(category), "0"

        # check if creditor_id is in purpose code
        wst = WhitespaceTokenizer()
        tokens = wst.tokenize(booking.usage)
        try:
            print(tokens[tokens.index("Einreicher-ID") + 1])
            booking.creditor_id = tokens[tokens.index("Einreicher-ID") + 1]
        except ValueError:
            print("No SEPA purpose code found")

        # start text analysis
        term_list = booking.text + ' ' + booking.usage + ' ' + booking.owner
        word_counts = self.feature_extractor.extract_termlist_features(
            term_list)
        predict_probabilities = self.clf.predict_proba(word_counts)
        #category = self.clf.predict(example_counts)

        # if max prediction probability is less than 70% assume that the booking category is unknown
        prob = str(max(max(predict_probabilities)))
        #print("P:" + str(prob))
        #print("Highest ranked category: " + str(category_names[np.argmax(predict_probabilities)]))

        if max(max(predict_probabilities)) < 0.7:
            category = str(fbcat.SONSTIGES.name)  # fallback category
        else:
            category = str(category_names[np.argmax(predict_probabilities)])

        #print(category)
        return str(category), predict_probabilities
Example #14
def redact_concepts(contents, concepts):

    synonyms = []  #will hold list of synonyms
    redacted = []  #return this; will hold redacted text
    redact = []  #holds list of words to redact

    #Makes list of synonyms of concept(s)
    for i in concepts:
        for syn in wordnet.synsets(i):
            for l in syn.lemma_names():
                synonyms.append(l)

    #Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        #Tokenize sentence into words
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)

        has_syn = 0
        #Checks words to see if any are/contain the synonyms
        for word in words:
            i = 0
            while i < len(synonyms):
                if (synonyms[i] in word.casefold()):
                    has_syn = 1
                    break
                else:
                    i = i + 1

        #If so, all words in sentence are to be redacted
        if (has_syn == 1):
            for word in words:
                redact.append(word)

    redacted = redact_items(redact, contents)
    return (redacted, redact)
Example #15
def redact_numbers(contents):

    #List of numbers (spelled out)
    numbers = [
        'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
        'ten', 'eleven', 'twelve', 'teen', 'twenty', 'thirty', 'forty',
        'fifty', 'hundred', 'thousand', 'million'
    ]

    redacted = []  #list to return, holds redacted text
    redact = []  #list that holds words to redact

    #Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        #Tokenize sentence into words
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)

        #Ignores upper/lower case and punctuation marks
        for word in words:
            i = 0
            while i < len(numbers):
                if (numbers[i] in word.casefold()):
                    redact.append(word)
                    break
                else:
                    i = i + 1

            #searches for digits in each word
            digits = re.findall(r'\d+', word)
            if digits:
                redact.append(word)

    redacted = redact_items(redact, contents)
    return (redacted, redact)
Example #16
    def count_frequency(self):
        self.__comments_dataframe = list_dataframe()
        read_comment_for_ngrams =  self.__comments_dataframe[0]
        white_space_tokenize = WhitespaceTokenizer()

        # put all the comments into a single text
        comment_ngrams = ' '.join([text for text in read_comment_for_ngrams["comments"]])
        # split out all the words
        comments_tokenize = white_space_tokenize.tokenize(comment_ngrams)

        # compute the frequency distribution of the text
        freq_dist = FreqDist(comments_tokenize)

        df_freq_dist = pd.DataFrame({
            "word": list(freq_dist.keys()),
            "frequency": list(freq_dist.values())
        })

        higher_frequency = df_freq_dist.sort_values(by="frequency", ascending=False).head(n=20)
        higher_frequency_dict = higher_frequency.to_dict(orient='list')
        self.__frequency["words"] = higher_frequency_dict["word"]
        self.__frequency["frequency"] = higher_frequency_dict["frequency"]
        return self.__frequency
Example #17
def extract_text(adr):
    temp = load_json_list(adr)
    data = []
    for item in temp:
        if 'extended_tweet' in item.keys():
            text = item['extended_tweet']['full_text']
        else:
            text = item['text']
        tokens = WhitespaceTokenizer().tokenize(text)
        if tokens[0] != 'Wind':
            text = re.sub(r"\s+", " ", text)
            text = text.strip()
            data.append(text)
    return data
Example #18
def redact_names(contents):

    redacted = []  #list to return, holds redacted text
    nnp = []  #list to hold proper nouns
    redact = []  #list that holds words to redact

    no_words = 0

    #Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        #Tokenize sentence into words, tag words' pos
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)
        tagged = nltk.pos_tag(words)

        #Goes through each word to see if it's an NNP
        for word, tag in tagged:
            #If an NNP, add to nnp list
            if (tag == 'NNP'):
                nnp.append(word)

        #Checks words to see if any are/contain NNP words
        for word in words:
            i = 0
            no_words = no_words + 1
            while i < len(nnp):
                if (nnp[i] in word):
                    redact.append(word)
                    break
                else:
                    i = i + 1

    redacted = redact_items(redact, contents)
    return (redacted, redact, no_words)
Example #19
class NonAlphaNumCharTokenizer(object):
    """
    Replace non-alphanumeric characters with spaces and tokenize the sentence on whitespace.
    For example: the sentence 'hello world, org.eclipse.core.launcher.main.main' is tokenized to
    [hello, world, org, eclipse, core, launcher, main, main].
    """
    REGEX = re.compile(r'[\W_]+', re.UNICODE)

    def __init__(self):
        self.tokenizer = WhitespaceTokenizer()

    def tokenize(self, text):
        text = re.sub(NonAlphaNumCharTokenizer.REGEX, ' ', text)

        return self.tokenizer.tokenize(text)
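A short usage sketch mirroring the docstring example:

tokenizer = NonAlphaNumCharTokenizer()
print(tokenizer.tokenize('hello world, org.eclipse.core.launcher.main.main'))
# -> ['hello', 'world', 'org', 'eclipse', 'core', 'launcher', 'main', 'main']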
Example #20
def redact_items(red_list, contents):

    redacted = []  #list to return, holds redacted text

    #Tokenize text into sentences
    default_st = nltk.sent_tokenize
    sentences = default_st(text=contents)

    for sentence in sentences:
        #Tokenize sentence into words
        ws = WhitespaceTokenizer()
        words = ws.tokenize(sentence)

        #Goes through each word to see if it's in the redaction list
        for word in words:
            #Redact word
            if (word in red_list):
                r = '\u2588' * len(word)
                redacted.append(r)
            else:
                redacted.append(word)

    redacted = (' '.join(redacted))
    return (redacted)
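A small usage sketch (hypothetical input; assumes nltk's punkt sentence tokenizer data is available):

sample = "John gave his sister a book."
print(redact_items(['his', 'sister'], sample))
# -> John gave ███ ██████ a book.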
Example #21
def doslogan4():
    transcript = open('transcription.txt').read()
    words = WhitespaceTokenizer().tokenize(transcript)
    tagged = POS_tagger(words)

    tags = alltags(tagged)

    newlist = []
    structure1 = [random.choice(syns), 'determiner', 'noun']
    for index, item in enumerate(structure1):
        if item in tags:
            one = givemeone(item, tagged)
            newlist.append(one)
        else:
            newlist.append(item)
    print(' '.join(newlist))
Example #22
def doslogan(structure):
    transcript = open('transcription.txt').read()
    words = WhitespaceTokenizer().tokenize(transcript)
    tagged = POS_tagger(words)

    tags = alltags(tagged)

    newlist = []

    for index, item in enumerate(structure):
        if item in tags:
            one = givemeone(item, tagged)
            newlist.append(one)
        else:
            newlist.append(item)
    cprint(' '.join(newlist), random.choice(color), random.choice(on_color))
Example #23
def ratio(tweets):
    
    nom = ["aint", "ain’t"] 
    denom = ['isn’t', 'aren’t', 'wasn’t', 'weren’t', 'haven’t', 
             'hasn’t', 'hadn’t', 'isnt', 'arent', 'wasnt', 'werent',
             'hasnt', 'havent', 'hadnt', 'is not', 'are not', 'was not',
             'were not', 'have not', 'has not', 'had not']
    nom_counter, denom_counter = 0, 0
    for tweet in tweets:
        tokens = WhitespaceTokenizer().tokenize(tweet)
        for item in tokens:
            if item in nom:
                nom_counter+=1
            if item in denom:
                denom_counter+=1
    if denom_counter == 0:
        r = nom_counter
    else:
        r = round(nom_counter/denom_counter, 4)
    
    return r, nom_counter, denom_counter
Example #24
class GeneralTokenizer(Tokenizer):
    def __init__(self):
        self.tokenizer = WhitespaceTokenizer()

    def normalize(self, text):
        return ' '.join(self.tokenize(text))

    def tokenize(self, text):
        result = []
        if type(text) is not unicode:
            if type(text) in (int, float):
                text = str(text)
            text = unicode(text, 'utf-8', errors='ignore')

        # pre tokenize
        for word in self.tokenizer.tokenize(text):
            word = word.strip(string.punctuation).lower()
            if word.endswith("'s") or word.endswith(u"’s"):
                word = word[:-2]

            if word and word.strip():
                result.append(word)
        return result
Example #25
	def whitespace_tokenize(self, text):
		wst = WhitespaceTokenizer()
		return wst.tokenize(text)
Example #26
"""Forth stage of the 'Text Generator' project.
We are taking text corpus as an input, then create
bigrams from the tokenized corpus, sort them in 'freq_dict' dictionary with
heads as keys, and list of tails as values. After that we randomly choose
the first word of the sentence, and the second word will be predicted by
looking up the first word of the chain in the model and choosing the most
probable next word from the set of possible follow-ups. This step is repeated
10 times (10 words in 1 sentence). We also print 10 sentences.
"""
from nltk import WhitespaceTokenizer
from nltk.util import bigrams
from collections import Counter
import random
"""First we open file and tokenize it"""
with open(input(), 'r', encoding='utf-8') as file:
    corpus = WhitespaceTokenizer().tokenize(file.read())
"""Then we make a bigram and organize a dictionary in fallowing manner:
key is a head of bigram, and values are all the tails for it.
"""
my_bigrams = list(bigrams(corpus))
bigrams_dict = {}
for head, tail in my_bigrams:
    bigrams_dict.setdefault(head, []).append(tail)
"""Then we create new dictionary: key is a head, and values is a dictionary,
with tails as keys and their count as values
"""
freq_dict = {}
for head, tails in bigrams_dict.items():
    freq_dict[head] = Counter(tails)
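The snippet stops after building freq_dict; the generation step described in the docstring is not included here. A minimal sketch of how it could continue (assuming each chosen word also appears as a head in freq_dict; otherwise the sentence is cut short):

# Generation step (sketch): pick a random first word, then keep appending
# the most frequent follow-up until the sentence has 10 words.
for _ in range(10):
    word = random.choice(list(freq_dict))
    sentence = [word]
    while len(sentence) < 10 and word in freq_dict:
        word = freq_dict[word].most_common(1)[0][0]
        sentence.append(word)
    print(' '.join(sentence))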

Example #27
                lines_new = lines[startIndex:endIndex]
                tranche1 = lines_new[getIndex('Tranche 1', lines_new)]
                tranche2 = lines_new[getIndex('Tranche 2', lines_new)]
                tranche3 = lines_new[getIndex('Tranche 3', lines_new)]
                contract4 = lines_new[getIndex('Contract 4', lines_new)]

                contracts = [tranche1, tranche2, tranche3, contract4]
                for contract in contracts:
                    # Tokenize the lines
                    default_st = nltk.sent_tokenize
                    sentences = default_st(text=contract)

                    for sentence in sentences:
                        # print(sentence)
                        # Tokenize sentence into words, tag words' pos
                        ws = WhitespaceTokenizer()
                        words = ws.tokenize(sentence)
                        #print(words)

                        # Read contents into df
                        df['Field'] = [field]
                        df['Contract'] = [words[0] + ' ' + words[1]]
                        df['Date'] = [date]
                        df['SalesGas_mmscf'] = [words[3]]
                        df['Cond/Oil_bbl'] = [words[4]]

                        mother_df = mother_df.append(other=df, ignore_index=True)

            except TypeError:
                print('error: ' + f)
Example #28
def main(_run, _config, _seed, _log):
    """

    :param _run:
    :param _config:
    :param _seed:
    :param _log:
    :return:
    """
    """
    Setting and loading parameters
    """
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)
    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Setting the parameter to save and loading parameters
    importantParameters = ['compare_aggregation', 'categorical']
    parametersToSave = dict([(parName, args[parName])
                             for parName in importantParameters])

    if args['load'] is not None:
        mapLocation = (
            lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    preprocessors = PreprocessorList()
    inputHandlers = []

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        categoricalEncoder, _, _ = processCategoricalParam(
            categoricalOpt, bugReportDatabase, inputHandlers, preprocessors,
            None, logger)
    else:
        categoricalEncoder = None

    filterInputHandlers = []

    compareAggOpt = args['compare_aggregation']
    databasePath = args['bug_database']

    # Loading word embedding
    if compareAggOpt["lexicon"]:
        emb = np.load(compareAggOpt["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(compareAggOpt["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)

        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
    elif compareAggOpt["word_embedding"]:
        # TODO: allow using embeddings and other representations
        lexicon, embedding = Embedding.fromFile(
            compareAggOpt['word_embedding'],
            'UUUKNNN',
            hasHeader=False,
            paddingSym=paddingSym)
        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
        paddingId = lexicon.getLexiconIndex(paddingSym)
    else:
        embedding = None

    if compareAggOpt["norm_word_embedding"]:
        embedding.zscoreNormalization()

    # Tokenizer
    if compareAggOpt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif compareAggOpt['tokenizer'] == 'white_space':
        logger.info(
            "Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % compareAggOpt['tokenizer'])

    # Preparing input handlers, preprocessors and cache
    minSeqSize = max(compareAggOpt['aggregate']["window"]
                     ) if compareAggOpt['aggregate']["model"] == "cnn" else -1
    bow = compareAggOpt.get('bow', False)
    freq = compareAggOpt.get('frequency', False) and bow

    logger.info("BoW={} and TF={}".format(bow, freq))

    if compareAggOpt['extractor'] is not None:
        # Use summary and description (concatenated) to address this problem
        logger.info("Using Summary and Description information.")
        # Loading Filters
        extractorFilters = loadFilters(compareAggOpt['extractor']['filters'])

        arguments = (databasePath, compareAggOpt['word_embedding'],
                     str(compareAggOpt['lexicon']), ' '.join(
                         sorted([
                             fil.__class__.__name__ for fil in extractorFilters
                         ])), compareAggOpt['tokenizer'], str(bow), str(freq),
                     SABDEncoderPreprocessor.__name__)

        inputHandlers.append(SABDInputHandler(paddingId, minSeqSize))
        extractorCache = PreprocessingCache(cacheFolder, arguments)

        if bow:
            extractorPreprocessor = SABDBoWPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer,
                paddingId, freq, extractorCache)
        else:
            extractorPreprocessor = SABDEncoderPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer,
                paddingId, extractorCache)
        preprocessors.append(extractorPreprocessor)

    # Create model
    model = SABD(embedding, categoricalEncoder, compareAggOpt['extractor'],
                 compareAggOpt['matching'], compareAggOpt['aggregate'],
                 compareAggOpt['classifier'], freq)

    if args['loss'] == 'bce':
        logger.info("Using BCE Loss: margin={}".format(args['margin']))
        lossFn = BCELoss()
        lossNoReduction = BCELoss(reduction='none')
        cmp_collate = PairBugCollate(inputHandlers,
                                     torch.float32,
                                     unsqueeze_target=True)
    elif args['loss'] == 'triplet':
        logger.info("Using Triplet Loss: margin={}".format(args['margin']))
        lossFn = TripletLoss(args['margin'])
        lossNoReduction = TripletLoss(args['margin'], reduction='none')
        cmp_collate = TripletBugCollate(inputHandlers)

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)
    """
    Load the training and validation data, and set how negative examples will be generated.
    """
    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator', )
        trainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None
                                 or negativePairGenOpt['type'] == 'none')
        masterIdByBugId = bugReportDatabase.getMasterIdByBugId()
        randomAnchor = negativePairGenOpt['random_anchor']

        if not offlineGeneration:
            logger.info("Not generate dynamically the negative examples.")
            negativePairGenerator = None
        else:
            pairGenType = negativePairGenOpt['type']

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    trainingDataset.duplicateIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'product_component':
                logger.info("Product Component Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = ProductComponentRandomGen(
                    bugReportDatabase,
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['k'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")
                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

            elif pairGenType == "positive_pre":
                logger.info("Positive Pre-selected list generator")
                negativePairGenerator = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)
            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")
                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            elif pairGenType == "misc_non_zero_positive_pre":
                logger.info(
                    "Misc: non-zero and Positive Pre-selected list generator")
                negativePairGenerator1 = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))

            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, non_negative, misc_non_zero, product_component, random_k, pre, positive_pre, misc_non_zero_pre and misc_non_zero_positive_pre"
                    % pairGenType)

        if isinstance(lossFn, BCELoss):
            training_reader = PairBugDatasetReader(
                trainingFile,
                preprocessors,
                negativePairGenerator,
                randomInvertPair=args['random_switch'])
        elif isinstance(lossFn, TripletLoss):
            training_reader = TripletBugDatasetReader(
                trainingFile,
                preprocessors,
                negativePairGenerator,
                randomInvertPair=args['random_switch'])

        trainingLoader = DataLoader(training_reader,
                                    batch_size=batchSize,
                                    collate_fn=cmp_collate.collate,
                                    shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        if isinstance(lossFn, BCELoss):
            validation_reader = PairBugDatasetReader(
                args.get('pairs_validation'), preprocessors)
        elif isinstance(lossFn, TripletLoss):
            validation_reader = TripletBugDatasetReader(
                args.get('pairs_validation'), preprocessors)

        validationLoader = DataLoader(validation_reader,
                                      batch_size=batchSize,
                                      collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None
    """
    Train and evaluate the model.
    """
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(),
                              lr=args['lr'],
                              weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(),
                               lr=args['lr'],
                               weight_decay=args['l2'])

    # Recall rate
    rankingScorer = GeneralScorer(
        model, preprocessors, device,
        PairBugCollate(inputHandlers, ignore_target=True),
        args['ranking_batch_size'], args['ranking_n_workers'])
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt, args['sample_size_rr_tr'])

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt,
                                                    args['sample_size_rr_val'])

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lrSchedulerOpt["step_size"], args["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"],
                         lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" %
                    (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info(
            "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
            (lrSchedulerOpt["decay"]))

        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 /
                           (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear"
            % lrSchedulerOpt["type"])

    # Set training functions
    def trainingIteration(engine, batch):
        engine.kk = 0
        model.train()

        optimizer.zero_grad()
        x, y = cmp_collate.to(batch, device)
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    def scoreDistanceTrans(output):
        if len(output) == 3:
            _, y_pred, y = output
        else:
            y_pred, y = output

        if lossFn == F.nll_loss:
            return torch.exp(y_pred[:, 1]), y
        elif isinstance(lossFn, (BCELoss)):
            return y_pred, y

    trainer = Engine(trainingIteration)
    trainingMetrics = {'training_loss': AverageLoss(lossFn)}

    if isinstance(lossFn, BCELoss):
        trainingMetrics['training_dist_target'] = MeanScoreDistance(
            output_transform=scoreDistanceTrans)
        trainingMetrics['training_acc'] = AccuracyWrapper(
            output_transform=thresholded_output_transform)
        trainingMetrics['training_precision'] = PrecisionWrapper(
            output_transform=thresholded_output_transform)
        trainingMetrics['training_recall'] = RecallWrapper(
            output_transform=thresholded_output_transform)
        # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        if not hasattr(engine, 'kk'):
            engine.kk = 0

        model.eval()

        with torch.no_grad():
            x, y = cmp_collate.to(batch, device)
            y_pred = model(*x)

            return y_pred, y

    validationMetrics = {
        'validation_loss':
        LossWrapper(lossFn,
                    output_transform=lambda x: (x[0], x[0][0])
                    if x[1] is None else x)
    }

    if isinstance(lossFn, BCELoss):
        validationMetrics['validation_dist_target'] = MeanScoreDistance(
            output_transform=scoreDistanceTrans)
        validationMetrics['validation_acc'] = AccuracyWrapper(
            output_transform=thresholded_output_transform)
        validationMetrics['validation_precision'] = PrecisionWrapper(
            output_transform=thresholded_output_transform)
        validationMetrics['validation_recall'] = RecallWrapper(
            output_transform=thresholded_output_transform)

    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    # recommendation
    recommendation_fn = generateRecommendationList

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = args['epochs'] - epoch == 0

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run,
                             logger,
                             preselectListRankingTrain,
                             rankingScorer,
                             bugReportDatabase,
                             None,
                             epoch,
                             "train",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run,
                             logger,
                             preselectListRanking,
                             rankingScorer,
                             bugReportDatabase,
                             args.get("ranking_result_file"),
                             epoch,
                             "validation",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if not lastEpoch:
            training_reader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(
                    epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run,
                             logger,
                             preselectListRanking,
                             rankingScorer,
                             bugReportDatabase,
                             args.get("ranking_result_file"),
                             0,
                             "validation",
                             recommendationListfn=recommendation_fn)

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader,
                                batch_size=batchSize,
                                collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError(
                'Evaluation of pairs using tanh was not implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy':
            ignite.metrics.Accuracy(
                output_transform=thresholded_output_transform),
            'test_precision':
            ignite.metrics.Precision(
                output_transform=thresholded_output_transform),
            'test_recall':
            ignite.metrics.Recall(
                output_transform=thresholded_output_transform),
            'test_predictions':
            PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'correct': metric._num_correct,
                    'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric,
                            (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'tp': metric._true_positives.item(),
                    'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type':
                    'metric',
                    'label':
                    metricName,
                    'accuracy':
                    float(acc),
                    'precision':
                    prec.cpu().numpy().tolist(),
                    'recall':
                    recall.cpu().numpy().tolist(),
                    'f1':
                    f1.cpu().numpy().tolist(),
                    'confusion_matrix':
                    metricValue.cpu().numpy().tolist(),
                    'epoch':
                    None
                })

                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'predictions': metric.predictions
                })

    # Calculate recall rate
    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset,
                                      recallRateOpt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase,
                                           recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run,
                         logger,
                         rankingClass,
                         rankingScorer,
                         bugReportDatabase,
                         recallRateOpt["result_file"],
                         0,
                         None,
                         group_by_master,
                         recommendationListfn=recommendation_fn)
Example #29
from bs4 import BeautifulSoup
import re
from nltk import ngrams
import csv


german_stopwords = stopwords.words('german')
english_stopwords = stopwords.words("english")
#https://www.pc-erfahrung.de/nebenrubriken/sonstiges/webdesignwebentwicklung/stoppwortliste.html

browser = webdriver.Chrome('/home/marco/Downloads/chromedriver')
browser.implicitly_wait(10)
stemmer = SnowballStemmer('german')
stemmed_words = []
all_bigrams = []
whitespace_wt = WhitespaceTokenizer()


def compare_text_with_keywords(html_string):
    all_formatted_words = []
    all_stemmed_words = []

    # split the string at uppercase letters, e.g. 'MarcoLang' becomes 'Marco Lang'
    splitted_string_by_uppercase = re.findall('[A-Z][^A-Z]*', html_string)
    string = str(splitted_string_by_uppercase)
    # formatted_string = string.replace(","," ").replace("'","").replace("[","").replace("]","").replace("\\n","")
    formatted_string = string.replace("\\n", "")  # to delete all line breaks
    splitted_words = word_tokenize(formatted_string.lower())
    for word in splitted_words:
        formatted_word = ''.join(e for e in word if (e.isalnum()))
        if formatted_word != '':
Example #30
File: tf_idf.py  Project: fuzzysound/tf-idf
import pymysql
import math
from nltk import WhitespaceTokenizer

connection = pymysql.connect(host="127.0.0.1",
                             user="******",
                             password="******",
                             charset='utf8',
                             db='tf-idf',
                             cursorclass=pymysql.cursors.DictCursor)

cursor = connection.cursor()
tokenizer = WhitespaceTokenizer()

sql_tf = 'SELECT * FROM wiki WHERE id=%s'
sql_idf = 'SELECT COUNT(*) count FROM inverted_index WHERE term=%s'


def tf_idf(id, term):
    cursor.execute(sql_tf, id)
    text = cursor.fetchone()['text']
    words = tokenizer.tokenize(text)
    words = list(map(lambda x: x.lower(), words))
    tf = math.log(1 + words.count(term) / len(words))
    cursor.execute(sql_idf, term)
    idf = 1 / cursor.fetchone()['count']
    print('TF-IDF of the term ' + term + ' in ID=' + str(id) + ': ' +
          str(tf * idf))


terms = [(41631770, 'also'), (6688599, 'debut'), (13794826, 'language')]
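The terms list is defined but never used in this snippet; presumably the script continues by calling tf_idf for each pair, along the lines of:

# Hypothetical continuation: score each (document id, term) pair.
for doc_id, term in terms:
    tf_idf(doc_id, term)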
Example #31
from collections import defaultdict, Counter
from nltk import WhitespaceTokenizer, bigrams
from random import choice, choices

bigrams_list = []
bigrams_dict = defaultdict(Counter)

with open(input(), 'r', encoding='utf-8') as file:
    for line in file:
        bigrams_list.extend(bigrams(WhitespaceTokenizer().tokenize(line)))

for head, tail in bigrams_list:
    bigrams_dict[head][tail] += 1

heads = list(bigrams_dict.keys())

first_word_set = set()
while len(first_word_set) < 10:
    word = choice(heads)
    if word[0].isupper() and word[-1] not in ".!?":
        first_word_set.add(word)

for word in first_word_set:
    sentence = [word]
    while len(sentence) < 5 or sentence[-1][-1] not in ".!?":
        next_words = list(bigrams_dict[sentence[-1]].keys())
        next_words_count = list(bigrams_dict[sentence[-1]].values())
        sentence.extend(choices(next_words, weights=next_words_count))
    print(" ".join(sentence))