Example #1
def extract_ngrams(df, key, by=[], ng_range=(1, 1), pad=False):
    """
    Nest terms as n-grams.

    Args:
    -----
    df : DataFrame with columns: term, author, doc_id
    key : name of the column holding the terms
    ng_range : (min_gram, max_gram)
    by : list containing fields to group by
    pad : whether to add <start>/<end> symbols when extracting n-grams
    """

    if pad:
        new_df = df.groupby(by)[key]\
            .apply(lambda x: list(everygrams(x, min_len=ng_range[0],
                                             max_len=ng_range[1],
                                             pad_left=True,
                                             pad_right=True,
                                             left_pad_symbol='<start>',
                                             right_pad_symbol='<end>')))\
            .explode()\
            .reset_index()
    else:
        new_df = df.groupby(by)[key]\
            .apply(lambda x: list(everygrams(x, min_len=ng_range[0],
                                             max_len=ng_range[1])))\
            .explode()\
            .reset_index()
    return new_df
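For reference, a minimal standalone sketch of the everygrams call used above (nltk.util.everygrams with boundary padding); the token list here is made up:

from nltk.util import everygrams

tokens = ["the", "cat", "sat"]
# All 1- and 2-grams with <start>/<end> boundary padding, as in extract_ngrams.
grams = list(everygrams(tokens, min_len=1, max_len=2,
                        pad_left=True, pad_right=True,
                        left_pad_symbol='<start>', right_pad_symbol='<end>'))
print(grams)  # unigrams and bigrams over ['<start>', 'the', 'cat', 'sat', '<end>']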
Example #2
def get_feature_weights(raw, lower_nopunc, raw_nopunc, cls, tfidf,
                        gram_length):
    X = tfidf.transform([lower_nopunc])
    coef = cls.coef_[0]
    lower_grams = list(
        everygrams(lower_nopunc.split(), gram_length, gram_length))
    original_grams = list(
        everygrams(raw_nopunc.split(), gram_length, gram_length))
    #   print(lower_grams)
    # print(len(original_grams), len(lower_grams))
    vocab = tfidf.vocabulary_
    weights = []
    for i in range(len(lower_grams)):
        lower_token, original_token = " ".join(lower_grams[i]), " ".join(
            original_grams[i])
        if lower_token in vocab and original_token in raw:
            idx = vocab[lower_token]
            weights.append([original_token, coef[idx] * tfidf.idf_[idx]])
        else:
            pass
            # print(lower_token)
    filtered = []
    if gram_length > 1:
        filtered = remove_overlap(weights, lower_nopunc, raw, raw_nopunc)
    else:
        weights = remove_duplicate(weights)

    weights.sort(key=lambda x: abs(x[1]), reverse=True)
    return weights, filtered
Example #3
def json_reader(fname, count=1000, stemming=False, bigrams=False):
    """
        Read reviews from a line-delimited JSON file
        Args:
            fname: str: input file (one JSON object per line)
            count: int: maximum number of documents to yield
            stemming: bool: remove stopwords and apply Porter stemming
            bigrams: bool: also extract unigram/bigram tuples via everygrams
        Returns:
            generator: iterator over documents
    """
    en_stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()

    def convertAscii(text):
        return ''.join([i if ord(i) < 128 else '' for i in text])

    for line in open(fname, mode="r"):
        if (count <= 0):
            break
        count -= 1

        rating = re.search('stars": (.+?),', line)
        if rating:
            rating = int(float(rating.group(1)))
        else:
            print('contin')
            continue

        review = re.search('"text": "(.+?)"', line)
        if review:
            review = review.group(1)
        else:
            print('contin')
            continue

        review = np.array(word_tokenize(review))
        if (not stemming):
            if bigrams:
                if (len(review) < 3):  # ignore it
                    continue
                review = np.array(list(everygrams(review, 1, 2)))
            yield {'rating': rating, 'review': review}
        else:
            stopped_tokens = filter(lambda token: token not in en_stop, review)
            stemmed_tokens = map(lambda token: p_stemmer.stem(token),
                                 stopped_tokens)
            review = np.array(list(stemmed_tokens))
            if bigrams:
                review = np.array(list(everygrams(review, 1, 2)))
            yield {'rating': rating, 'review': review}
Example #4
def get_ranks(doc_id,
              doc,
              get_weight,
              get_personalization=None,
              weight='weight'):
    G = nx.Graph(name=doc_id)
    doc = nlp(doc)

    for sentence in doc.sents:
        tokens = [str(t) for t in sentence if t.is_alpha and not t.is_stop]
        grams = everygrams(tokens, min_len=1, max_len=3)
        grams = [' '.join(g) for g in grams]
        G.add_nodes_from(grams)

        edges = list(itertools.combinations(grams, 2))
        weighted_edges = [(v1, v2, get_weight(v1, v2)) for v1, v2 in edges]
        G.add_weighted_edges_from(weighted_edges)

    personalization = {node: get_personalization(node)
                       for node in G.nodes} if get_personalization else None
    rank = nx.pagerank(G,
                       alpha=1 - 0.15,
                       max_iter=50,
                       weight=weight,
                       personalization=personalization)
    return rank
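A hedged usage sketch for get_ranks: it assumes a module-level spaCy pipeline bound to nlp (as the function body implies) and a caller-supplied edge-weight function; the sample text, doc_id and weight function below are all assumptions.

# Assumes e.g. `nlp = spacy.load("en_core_web_sm")` earlier in the module.
ranks = get_ranks(doc_id="doc-1",
                  doc="Cats sit on mats. Dogs chase cats.",
                  get_weight=lambda v1, v2: 1.0)  # constant weights, no personalization
top_terms = sorted(ranks.items(), key=lambda kv: kv[1], reverse=True)[:5]
print(top_terms)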
Example #5
def activity_two(frec_letters, msg, n_grams):

    R = math.log(len(frec_letters), 2)
    print("\nRango absoluto R = ", R)
    # Build the different n-grams
    grams = list(everygrams(msg.lower(), n_grams, n_grams))
    # Convert to a set to drop repeated elements
    realgram = list(set(grams))

    # Compute the rates r
    rangos_grams = graficar_rangos(n_grams + 1, msg.lower(), R)
    # Print the rates
    print("\nrangos 'r' para cada n-gram")
    for i in range(n_grams):
        print("n-gram[", i + 1, "] - R =", rangos_grams[i])

    # Redundancy for each n
    print("\nRedundancia 'D' para cada rango 'r'")
    for i in range(n_grams):
        print("n-gram[", i + 1, "] - D =", R - rangos_grams[i])

    print("\nObservar figura 'grafica_rango.png'")
    # Information content of each char,
    # plus the entropy
    entropia = 0
    print("\nBits de informacion para cada simbolo")
    for key, frec in frec_letters:
        bit_info = math.log2(1 / frec)
        print(key, bit_info)
        entropia += (frec * bit_info)
    print("\nEntropia = ", entropia)
Example #6
def get_new_ngrams(texts, n=3, vocabulary=None):
    ngrams = {}
    for item in texts:
        # Processing incoming text:
        text = item.get("text")
        probability = item.get("probability")
        text = re.sub(r"[^а-я\-\s]", "", text.lower().strip())

        for i in range(1, n + 1):
            # Getting i-grams:
            igrams = list(nltk.everygrams(text.split(), i, i))

            # Removing i-grams that do not have any new words
            bad_indices = []
            for index, igram in enumerate(igrams):
                bad_igram = True
                for word in igram:
                    if word not in vocabulary:
                        bad_igram = False
                        break
                if bad_igram:
                    bad_indices.append(index)
            for bad_index in bad_indices[::-1]:
                igrams.pop(bad_index)

            # Collecting i-grams with new words
            if len(igrams) > 0:
                if i not in ngrams:
                    ngrams[i] = {}
                ngrams[i].update({
                    " ".join(igram): probability
                    for igram in igrams if " ".join(igram) not in ngrams[i]
                    or probability > ngrams[i][" ".join(igram)]
                })
    return ngrams
Example #7
def freq_by_cuisine():
    #create word frequency line plots for each cuisine in the database

    df = pd.read_csv('concat_uncleaned_recipes.csv').dropna()
    #read in the file and drop out the nans

    df['Ingredients'] = df.apply(
        lambda row: ' '.join(pre_processing.clean_strings(row['Ingredients'])),
        axis=1)
    #create ingredients list for each recipe
    df_freq_mex = df[df['Cuisine'] == 2]
    df_freq_ital = df[df['Cuisine'] == 3]
    df_freq_fren = df[df['Cuisine'] == 5]
    df_freq_amer = df[df['Cuisine'] == 6]
    df_freq_brit = df[df['Cuisine'] == 7]
    df_freq_ch = df[df['Cuisine'] == 8]
    df_freq_ind = df[df['Cuisine'] == 9]
    df_freq_japan = df[df['Cuisine'] == 13]

    df_list = [
        df_freq_mex, df_freq_ital, df_freq_fren, df_freq_amer, df_freq_brit,
        df_freq_ch, df_freq_ind, df_freq_japan
    ]

    for cuisine in df_list:
        #go through each cuisine and plot their word frequency for the top 20 terms
        data = cuisine['Ingredients'].apply(
            lambda row: list(everygrams(row.split(' '), min_len=2, max_len=2)))
        flat_data = [item for sublist in data for item in sublist]
        fdist = FreqDist(flat_data)

        print(fdist.most_common(20))
        word_distro_plot(fdist)
Example #8
def extract_skills(input_text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(input_text)

    # remove the stop words
    filtered_tokens = [w for w in word_tokens if w not in stop_words]

    # remove the punctuation
    filtered_tokens = [w for w in filtered_tokens if w.isalpha()]

    # generate bigrams and trigrams (such as artificial intelligence)
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 2, 3)))

    # we create a set to keep the results in.
    found_skills = set()

    # we search for each token in our skills database
    for token in filtered_tokens:
        if token.lower() in SKILLS_DB:
            found_skills.add(token)

    # we search for each bigram and trigram in our skills database
    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB:
            found_skills.add(ngram)

    return found_skills
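The same matching idea in isolation, as a small sketch; SKILLS_DB below is a toy stand-in for the project's real skills database, and the sample text is made up:

import nltk

SKILLS_DB = {"python", "machine learning", "natural language processing"}  # toy data

text = "Experienced in Python and machine learning projects"
tokens = [w for w in text.split() if w.isalpha()]
bigrams_trigrams = list(map(' '.join, nltk.everygrams(tokens, 2, 3)))

found = {t for t in tokens if t.lower() in SKILLS_DB}
found |= {g for g in bigrams_trigrams if g.lower() in SKILLS_DB}
print(found)  # expected: {'Python', 'machine learning'}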
Example #9
def get_ngram_in_query(docs, query, min_n, max_n):
    docs = preprocess_docs(docs, query)
    ngram = []
    for doc in docs:
        for sent in doc.split_sent():
            ngram += [
                ''.join(token) for token in everygrams(sent, min_n, max_n)
                for char in query if char in token
            ]
    return ngram
Example #10
def long_gram_first(ngram):
    for token in ngram:
        tf = ngram[token]
        for short in [
                ''.join(short) for short in everygrams(token,
                                                       len(token) - 1,
                                                       len(token) - 1)
        ]:
            ngram[short] -= tf
    return ngram
Example #11
def graficar_rangos(max_rango, msg, R):
    lis_values_rangos = []
    for i in range(1, max_rango):
        # Build the different n-grams
        grams = list(everygrams(msg, i, i))
        # Convert to a set to drop repeated elements
        realgram = list(set(grams))
        # print(realgram)
        r = math.log(len(realgram), 2**i)
        lis_values_rangos.append(r)
    return lis_values_rangos
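For context on the computation above: for each i the function takes r = log base 2^i of the number of distinct i-grams, i.e. roughly the rate actually used per symbol. A tiny hedged check on a made-up message:

import math
from nltk import everygrams

msg = "holaholahola"
i = 2
distinct = set(everygrams(msg, i, i))  # distinct character bigrams: ho, ol, la, ah
r = math.log(len(distinct), 2 ** i)    # 4 distinct bigrams -> log_4(4) = 1.0
print(len(distinct), r)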
Example #12
def check_spelling(word):
    d = enchant.Dict('en_US')
    try:
        if d.check(word):
            return word.strip()
        else:
            # valid multi-character substrings of the word, as judged by enchant
            sub_string_arr = [''.join(_ngram) for _ngram in everygrams(word)
                              if d.check(''.join(_ngram)) and len(_ngram) > 1]
            word = sub_string_arr[-1].strip()  # take the last one found
            word = ''.join(e for e in word if e.isalpha())
            return word
    except:
        return False
Example #13
def tokenize_lemmatize_ngram(sentence):
    sentence = sentence.lower().strip('"')
    lemmatizer = WordNetLemmatizer()
    tokenized_list = nltk.word_tokenize(sentence)
    tokenized_list = [
        word for word in tokenized_list if word not in stop_words
    ]

    lemmatized_word_list = [lemmatizer.lemmatize(w) for w in tokenized_list]
    every_gramm_list = list(everygrams(lemmatized_word_list, 2, 4))
    #print(every_gramm_list)
    return [' '.join(gram) for gram in every_gramm_list]
Example #14
    def tokenize(self, text: str) -> List[Token]:
        """
        Splits sentences into a set of all possible ngrams up to self._max_ngram_degree using nltk
        """
        ngrams_iterator = everygrams(text.split(),
                                     max_len=self._max_ngram_degree)
        tokens = [Token(" ".join(ngram)) for ngram in ngrams_iterator]
        for start_token in self._start_tokens:
            tokens.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            tokens.append(Token(end_token, -1))
        return tokens
Example #15
def ngrams_to_topics(phrases, merge=True, min_similarity=.96):
    # Core analysis: find matches
    found_topics = {}
    successful_grams = {}
    for concept in phrases:
        for ngram in everygrams(concept.split(), 1, 3):
            # TODO: pick between 'phrase' and 'concept' terminology
            concept = "_".join(ngram)
            if concept in MODEL:
                # there's an exact match for the '_'-concatenated ngram in the ontology
                matches = MODEL[concept]
            else:
                # we'll instead search for ontology elements proximate in vector space
                matches = match_ngram(ngram, merge=merge)
            for match in matches:
                topic = match["topic"]
                sim_t = match["sim_t"]
                wet = match["wet"]
                sim_w = match["sim_w"]
                if sim_t >= min_similarity and topic in CSO["topics_wu"]:
                    if topic in found_topics:
                        # tracking this match
                        found_topics[topic]["times"] += 1
                        found_topics[topic]["gram_similarity"].append(sim_w)
                        # tracking the matched gram
                        if concept in found_topics[topic]["grams"]:
                            found_topics[topic]["grams"][concept] += 1
                        else:
                            found_topics[topic]["grams"][concept] = 1
                        # tracking the most similar gram to the topic
                        if sim_t > found_topics[topic]["embedding_similarity"]:
                            found_topics[topic]["embedding_similarity"] = sim_t
                            found_topics[topic]["embedding_matched"] = wet
                    else:
                        # creating new topic in the result set
                        found_topics[topic] = {
                            'grams': {
                                concept: 1
                            },
                            'embedding_matched': wet,
                            'embedding_similarity': sim_t,
                            'gram_similarity': [sim_w],
                            'times': 1,
                            'topic': topic
                        }
                    if sim_w == 1:
                        found_topics[topic]["syntactic"] = True
                    # reporting successful grams: it is the inverse of found_topics["topic"]["grams"]
                    if concept in successful_grams:
                        successful_grams[concept].append(topic)
                    else:
                        successful_grams[concept] = [topic]
    return found_topics, successful_grams
Example #16
def cal_freq_ngrams(word_lv_sents):
    gram_freq = dict()
    with tqdm(total=len(word_lv_sents)) as bar:
        for sent in word_lv_sents:
            max_l = min([len(sent), 4])
            for gram in everygrams(sent, max_len=max_l):
                if gram not in gram_freq:
                    gram_freq[gram] = 1
                else:
                    gram_freq[gram] += 1
            bar.update()
    return gram_freq
Example #17
def extract_skills(corpus, filename):
    '''
    Parses a string to extract resume skills

    Parameters:
        corpus (string): The extracted text of a resume
        filename (string): The filepath to the resume being parsed

    Returns:
        skills (set): The extracted skills of the corpus
    '''
    global UNKNOWNS

    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(corpus)

    filtered_tokens = [
        w.lower() for w in word_tokens if w not in stop_words and w.isalpha()
    ]

    bitri = nltk.everygrams(filtered_tokens, 2, 3)
    filtered_tokens = set(filtered_tokens)
    for gram in bitri:
        gram = ' '.join(gram)
        gram = gram.lower()
        filtered_tokens.add(gram)

    db = DatabaseInterface()
    skills = db.getKnownSkills(filtered_tokens)
    unknown_skills = db.getUnknowns(filtered_tokens)
    UNKNOWNS = unknown_skills.copy()

    try:
        session.run(asyncAPICheck)
    except SystemExit:
        pass

    db.recordSkills(UNKNOWNS)
    db.recordNotSkills(unknown_skills.difference(UNKNOWNS))

    extraction_package_skills = set([
        elem.lower()
        for elem in ResumeParser(filename).get_extracted_data()['skills']
    ])
    db.recordSkills(extraction_package_skills)

    skills = skills.union(extraction_package_skills)
    skills = skills.union(UNKNOWNS)

    db.close()

    return skills
Example #18
def extract_people(data, list1):
    """
    Extracts potential People nominees from an individual tweet
    """

    result = []

    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'golden', 'globe', 'globes']
    stop = remove_terms + list1

    for tweet in data:

        tweet = re.sub(r"\d+", "", tweet)      #strip nums
        tweet = re.sub(r'http\S+', '', tweet)  #strip urls
        tweet = re.sub(r'#\S+', '', tweet)     #strip hashtags
        tweet = tweet.translate(translator)    #strip non-alphanumeric characters
        tweet = tweet.split()                  #tokenize
        tweet = [term for term in tweet if term.lower() not in stop_words] #remove stop words
        for i in stop:
            # rebuild the list instead of removing items while iterating
            tweet = [j for j in tweet if i.lower() not in j.lower()]
        result.append(tweet)



    grams = []

    for tweet in result:
        if tweet:
            # Get all possible bigrams & trigrams in a tweet
            gram = list(nltk.everygrams(tweet, 2, 3))

            # Filter through and append to list for tweet
            for g in gram:
                if len(g) == 2:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])):
                        grams.append(' '.join(g))
                else:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[2])):
                        grams.append(' '.join(g))


    fdist = nltk.FreqDist(grams)

    try:
        names = fdist.most_common()
    except:
        names = ""

    return names
Example #19
    def get_word_vector(self,
                        word: str,
                        lemma: str,
                        pos_tag: str = '',
                        morph_tags: Tuple[str] = tuple()):
        morphemes = self.word2morph[lemma]
        ngrams = tuple([''.join(g) for g in everygrams(word, min_len=self.ngram_min_len, max_len=self.ngram_max_len)])

        vector = self.morph2vec.get_vector(word=word, lemma=lemma, pos=pos_tag,
                                           morph_tags=morph_tags,
                                           morphemes=morphemes.segments,
                                           ngrams=ngrams)
        return vector
Example #20
    def train(self):
        logger.info("Training model...")
        logger.info("tokenizing...")

        corpus = " ".join(self.df["haiku"])
        if self.tokenization == "words":
            tokens = nltk.word_tokenize(corpus)
        elif self.tokenization == "characters":
            tokens = list(corpus)
        ngrams = nltk.everygrams(tokens, max_len=self.order)
        logger.info("fitting...")
        self.model.fit([ngrams], vocabulary_text=self.vocab)
        logger.info("Trained model.")
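A self-contained sketch of the same fit pattern written directly against nltk.lm.MLE (the class above hides self.model, self.vocab, self.order and self.df behind attributes; the corpus and variable names below are made up):

import nltk
from nltk.lm import MLE

order = 3
corpus = "an old silent pond a frog jumps into the pond splash silence again"
tokens = corpus.split()

ngrams = nltk.everygrams(tokens, max_len=order)  # all 1- to 3-grams
model = MLE(order)
model.fit([ngrams], vocabulary_text=tokens)      # a single "document" of n-grams
print(model.score("frog", ("a",)))               # P(frog | a) under the fitted model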
Example #21
def extract_presenters(data, list1, winners):
    result = []

    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe', 'golden globe', 'goldenglobes', 'goldenglobe', 'golden', 'globe', 'globes']

    if winners:
        stop = remove_terms + list1 + winners.split()
    else:
        stop = remove_terms + list1

    for tweet in data:

        tweet = re.sub(r"\d+", "", tweet) #strip nums
        tweet = re.sub(r'http\S+', '', tweet) #strip urls
        tweet = re.sub(r'#\S+', '', tweet) #strip hashtags
        tweet = tweet.translate(translator) #strip non-alphanumeric characters
        tweet = tweet.split() #tokenize

        for i in stop:
            # rebuild the list instead of removing items while iterating
            tweet = [j for j in tweet if i.lower() not in j.lower()]
        result.append(tweet)


    grams = []

    for tweet in result:
        if tweet:
            gram = list(nltk.everygrams(tweet, 2, 3))
            for g in gram:
                if len(g) == 2:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])):
                        grams.append(' '.join(g))
                else:
                    if bool(re.match(r'\b[A-Z][a-z]+\b', g[0])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[1])) and bool(re.match(r'\b[A-Z][a-z]+\b', g[2])):
                        grams.append(' '.join(g))


    fdist = nltk.FreqDist(grams)


    try:
        names = fdist.most_common()
    except:
        names = ""

    return names
Example #22
def get_everygrams(data: List[List[str]], n: int):
    grams = []
    for idx, sentence in enumerate(data):

        gram = everygrams(
            sentence,
            min_len=2, 
            max_len=n, 
            pad_left=True, 
            pad_right=True,
            left_pad_symbol="<s>",
            right_pad_symbol="</s>")

        grams.append(gram)
    return grams
Example #23
def get_collocations(words):
    # returns n-grams up to trigrams that appear more than 3 times, with pruning of grams that are redundant
    minimum_frequency = 3
    ngrams = {"_".join(ngram): frequency/len(words) for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items() if frequency > minimum_frequency}
    collocations = dict(ngrams)
    for ngram, likelihood in dict(ngrams).items():
        grams = ngram.split("_")
        if len(grams) != 1:
            gram_likelihoods = [ngrams[gram] for gram in grams]
            if likelihood < 0.5 * np.prod(gram_likelihoods)**(1 / len(grams)):
                collocations.pop(ngram, None)
            else:
                for gram in grams:
                    collocations.pop(gram, None)
    return sorted(collocations.items(), key=itemgetter(1), reverse=True)
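The pruning rule above keeps a multi-word gram only if its relative frequency is at least half the geometric mean of its component words' frequencies; a small hedged check of that threshold with made-up numbers:

import numpy as np

# Hypothetical likelihoods (relative frequencies) for "machine", "learning"
# and the candidate collocation "machine_learning".
gram_likelihoods = [0.02, 0.01]
bigram_likelihood = 0.009

threshold = 0.5 * np.prod(gram_likelihoods) ** (1 / len(gram_likelihoods))
print(threshold)                       # ~0.00707
print(bigram_likelihood >= threshold)  # True: the bigram would be kept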
Example #25
def exists(gram, keyword):
    """Checks whether an n-gram already exists in the keyword or not."""
    keyword = keyword.lower()

    # build all 1- to 5-grams of the keyword to match against the given gram
    keyword = list(everygrams(keyword.split(), 1, 5))
    new_keyword=[]
    
    for p in keyword:
        p=" ".join(p)
        new_keyword.append(p)
    keyword=new_keyword
    if gram.lower() in keyword:
        return True
    else:
        return False
Example #26
def count_words(s, n_gram_min, n_gram_max, nb_words):
    stop_words = set(stopwords.words("english"))
    l_w = ['aren',"aren't","couldn'",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'don',"don't","hadn't",'hasn',
           "hasn't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'no','nor','not',"shan't",'shouldn',
           "shouldn't",'wasn',"wasn't","won't",'wouldn',"wouldn't"]
    for i in l_w:
        stop_words.discard(i)
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    filtered_sentence = [] 
    for w in tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)
    output = list(everygrams(filtered_sentence, n_gram_min, n_gram_max))
    c = Counter(output)
    counts = c.most_common(nb_words)
    return counts
Example #27
def df_get_tokens(df, col_name, n_gram=1):
    """ Create tokens of 'col_name' and store them in df['tokenized_text'] """
    def _token_creator(sentence):
        replaced_punctation = list(map(lambda token: re.sub(r'[^\wa-zA-Z0-9!?]+', '', token), sentence))
        removed_punctation = list(filter(lambda token: not token.isdigit(), replaced_punctation))
        removed_empty = list(filter(None, removed_punctation))
        
        replace_ = list(map(lambda token: re.sub(r'^_|(\d)+(_$|)|_\W|\W_|_$', '', token), removed_empty))
        replace_ = list(map(lambda token: re.sub(r'^_|_$', '', token), replace_))
        removed_empty = list(filter(None, replace_))
        
        return removed_empty
    
    if n_gram == 1:
        df['tokenized_text'] = list(map(nltk.word_tokenize, df[col_name]))
    else:
        df['tokenized_text'] = df[col_name].apply(lambda x: ['_'.join(ng) for ng in nltk.everygrams(nltk.word_tokenize(x), 1, n_gram)])
    df['tokenized_text'] = list(map(_token_creator, df.tokenized_text))
Example #28
def getDaySubjects(identifier):
    """ Attempt to add subjects based on the 50 most frequent n-grams in this IA item """

    # Cf. https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
    # https://agailloty.rbind.io/en/project/nlp_clean-text/
    # https://stackoverflow.com/a/58656665
    # print("INFO: Planning to add the following words to {}".format(identifier))
    r = requests.get("https://archive.org/download/{}/{}_djvu.txt".format(
        identifier, identifier))
    tokens = nltk.tokenize.word_tokenize(r.text, language="italian")
    tokens = [
        word.lower() for word in tokens if word.isalnum()
        and not word in nltk.corpus.stopwords.words("italian")
    ]
    grams = nltk.FreqDist(nltk.everygrams(tokens, min_len=2, max_len=5))
    commongrams = [
        " ".join(gram[0]) for gram in grams.most_common() if gram[1] > 3
    ][:50]
    return commongrams
Example #29
def get(filename, bookname):

#opening, reading, translating (replacing), and splitting the Sorcerer's Stone file
    with open(filename,"r",encoding='utf8',errors="ignore") as f:
        text_list = f.read().lower().split()
        
    #--------------------------------------------------------------------------
    #takes out stopwords
    #stopwords is a list of common words that can be removed
    stop_list = []
    stop_words = list(stopwords.words("english"))
    for word in text_list:
        if word in stop_words:
            pass
        else:
            stop_list.append(word) 
    #--------------------------------------------------------------------------       
    #translate is a built-in string method which helps with changing
    #the characters; here it is used to remove punctuation.
    trans = str.maketrans("","",'~!@#$%^&*()`,.<>/?\\|[]{};-\n\':"') 
    strip_list = []
    for word in stop_list:
        word = word.translate(trans)
        strip_list.append(word) # adding the single item to a existing list
    #--------------------------------------------------------------------------
    #stem helps with breaking the word down to its base/root
    stem_list = []
    ss = SnowballStemmer("english")
    for word in strip_list:
        word = ss.stem(word)
        stem_list.append(word)
    #--------------------------------------------------------------------------
    #Returns all possible ngrams generated from the text.
    ngram = list(everygrams(stem_list, min_len = 3, max_len = 5))  
    # adds any item to the existing list/dictionary
    count_dict = defaultdict(lambda: 0)
    # counting the number of ngrams.
    for tuples in ngram:
        count_dict[tuples] += 1
    
    df = pd.DataFrame(count_dict, index = [bookname])
            
    return df
Example #30
def extract_skills(nlp_text, noun_chunks, skills_file=None):
    '''
    Helper function to extract skills from spacy nlp text

    :param nlp_text: object of `spacy.tokens.doc.Doc`
    :param noun_chunks: noun chunks extracted from nlp text
    :param skills_file: optional path to a CSV file of skills; defaults to the bundled skills.csv
    :return: list of skills extracted
    '''
    tokens = [token.text for token in nlp_text if not token.is_stop]

    if not skills_file:
        data = pd.read_csv(
            os.path.join(os.path.dirname(__file__), 'skills.csv'))
    else:
        data = pd.read_csv(skills_file)

    skills = list(data.columns.values)
    skills = [s.replace('_', ' ') for s in skills]
    skillset = []

    # generating n-grams from text

    ngram_len = max(len(group.split()) for group in skills)
    list_ngrams = list(everygrams(tokens, 1, ngram_len))

    chunks = [' '.join(words) for words in list_ngrams]

    # process to the matching with the list of skills

    for token in chunks:
        token = token.lower().strip()
        if token in skills:
            skillset.append(token)
        else:
            highest = process.extractOne(token, skills)
            if highest[1] == 100:
                skillset.append(highest[0])

    skill_list = [
        i.replace(' ', '_') for i in set([i.lower() for i in skillset])
    ]
    return sorted(skill_list)
Example #31
    def from_conll_line(self, line):
        parts = line.split('\t')
        word = parts[1].replace(self.special_char, '-')
        lemma = parts[2].replace(self.special_char, '-').lower()
        morphemes = self.word2morphemes[
            lemma].segments if self.word2morphemes and lemma else tuple()

        return Token(
            index=int(parts[0]),
            word=word,
            lemma=lemma,
            pos=parts[3],
            xpos=parts[4],
            morphological_tags=tuple(parts[5].split('|')),
            morphemes=morphemes,
            ngrams=tuple([
                ''.join(g) for g in everygrams(word,
                                               min_len=self.min_ngram_len,
                                               max_len=self.max_ngram_len)
            ]),
        )
Example #32
def build(n, input_file, output_file_name):
    sentences = []
    with codecs.open(input_file, 'r', encoding='utf8') as f:
        for line in f:
            sentences.append(line)

    ngrams = []
    exclude = set(string.punctuation)
    for sentence in sentences:
        sentence = ''.join(ch for ch in sentence.lower() if ch not in exclude)
        ngrams.extend(list(everygrams(sentence.split(), max_len=n, min_len=n)))

    unique_ngrams = sorted(set(ngrams))

    output_file = output_file_name + "_unclean.txt"

    with codecs.open(output_file, 'w', encoding='utf8') as f:
        for ngram in unique_ngrams:
            line = ngram + (str(ngrams.count(ngram)),)
            for word in line:
                f.write(''.join(word) + ' ')
            f.write('\n')

    return output_file