def get_recall_unordered(recall_list, query_list):
    """
    Determine the fraction of queries in the recall list that were captured by the query list without concern for
    word order, capitalization, or punctuation. Differences in apostrophes (single quotes) will still be considered.

    :param recall_list: list containing all queries from a recall data set
    :param query_list: list containing all automatically generated queries
    :return: fraction of recall queries captured
    """
    tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')
    query_token = []
    for q in query_list:
        l = sorted(tokenizer.tokenize(q.lower()))
        query_token.append(l)

    num = 0
    for q in recall_list:
        l = sorted(tokenizer.tokenize(q.lower()))
        if l in query_token:
            num += 1

    if len(recall_list) == 0:
        return 0, 0
    return float(num) / float(len(recall_list)), num
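
# Minimal usage sketch with illustrative data, assuming get_recall_unordered above
# and `from nltk.tokenize import RegexpTokenizer` are in scope.
recall_queries = ["cheap flights to Boston", "Boston cheap flights!"]
generated_queries = ["flights cheap to boston", "hotels in boston"]
fraction, hits = get_recall_unordered(recall_queries, generated_queries)
# the first recall query matches the first generated query once word order, case,
# and punctuation are ignored, so this yields fraction == 0.5 and hits == 1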
Example 2
def remove_stop_word_punctuation(sentence):
    '''
        removes stop words and punctuation from the given text

        param sentence: user provided text
        type sentence: str

        returns: preprocessed sentence
        rtype: str
    '''

    stop_words = set(stopwords.words('english'))

    word_tokens = []

    for token in word_tokenize(sentence):
        for t in wordpunct_tokenize(token):
            # remove punctuation
            tokenizer = RegexpTokenizer(r'\w+')
            r = tokenizer.tokenize(t)
            if r:
                word_tokens.append(r[0])

    # remove stop words
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    return ' '.join(x for x in filtered_sentence)
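
# Minimal usage sketch, assuming the NLTK imports used above (stopwords,
# word_tokenize, wordpunct_tokenize, RegexpTokenizer) and the corresponding
# corpora are available.
print(remove_stop_word_punctuation("Hello, how are you doing today?"))
# typically prints "Hello today" once stop words and punctuation are removed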
    def getText(self):
        filename = self.textEdit.toPlainText()
        symb_remove = RegexpTokenizer(r'\w+')
        list1 = symb_remove.tokenize(filename)
        print("-----------------------------------")
        filename = self.textEdit_2.toPlainText()
        symb_remove = RegexpTokenizer(r'\w+')
        list2 = symb_remove.tokenize(filename)

        print("following is the wordlist of the file")
        print(list1)
        print(list2)

        sims = []
        initialList = []

        for word1, word2 in product(list1, list2):
            syns1 = wordnet.synsets(word1)
            print(syns1)
            syns2 = wordnet.synsets(word2)
            print(syns2)
            for syn1 in syns1:
                for syn2 in syns2:
                    s = syn1.wup_similarity(syn2)
                    if s is None:
                        s = 0

                    initialList.append(s)
                    print(str(syn1) + " second word " + str(syn2))
                    print(s)
            print(initialList)
Example 4
    def tokenize(self, attr):
        accepted = {
            'title': self.title,
            'description': self.description,
            'cve': self.cve,
            'cwe': self.cwe,
            'refs': self.refs,
            'dsk': self.dsk
        }
        matcher = {
            'title': r'\w+[-\w+]*',
            'description': r'\w+[-\w+]*',
            'cve': r'CVE[\s-]\d+[\s-]\d+'
        }

        if attr not in matcher:
            return 'It is not possible to tokenize this plugin attribute.'

        tokenizer = RegexpTokenizer(matcher[attr])
        stop = stopwords.words('english')
        final = []
        if attr == 'title' or attr == 'description':
            intermediate = tokenizer.tokenize(accepted[attr])
            final = [i.lower() for i in intermediate if i not in stop]
        elif attr == 'cve':
            intermediate = tokenizer.tokenize(','.join(accepted[attr]))
            final = [i.lower().replace(' ', '-') for i in intermediate if i not in stop]

        return final
def get_tokens(dict_element):
    # Remove stop words from data and perform initial
    # cleanup for feature extraction

    query = dict_element['query']
    desc = dict_element['product_description']
    title = dict_element['product_title']
    stop = stopwords.words('english')

    pattern = r'''(?x)               # set flag to allow verbose regexps
          ([A-Z]\.)+         # abbreviations, e.g. U.S.A.
          | \$?\d+(\.\d+)?%? # numbers, incl. currency and percentages
          | \w+([-']\w+)*    # words w/ optional internal hyphens/apostrophe
          | @((\w)+([-']\w+))*
          | [+/\-@&*]        # special characters with meanings
        '''

    #pattern = r'[+/\-@&*#](\w+)|(\w+)'
    tokenizer = RegexpTokenizer(pattern)



    #tokenizer = RegexpTokenizer(r'\w+')
    query_tokens = tokenizer.tokenize(query)
    query_tokens = [x.lower() for x in query_tokens]
    desc_tokens = tokenizer.tokenize(desc)
    desc_tokens = [x.lower() for x in desc_tokens if x.lower() not in stop]
    title_tokens = tokenizer.tokenize(title)
    title_tokens = [x.lower() for x in title_tokens if x.lower() not in stop]

    return query_tokens, title_tokens, desc_tokens
Example 6
def summarize(text): 
    tokenizer = RegexpTokenizer(r'\w+')
    formatted_text = tokenizer.tokenize(text.lower())
    sentence_list = nltk.sent_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')
    word_frequencies = {}
    for word in formatted_text:
        if word not in stopwords:
            if word not in word_frequencies:
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    max_freq = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word]/max_freq
    
    sentence_scores = {}
    for sent in sentence_list:
        for word in tokenizer.tokenize(sent.lower()):
            if word in word_frequencies:
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    import heapq
    summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)  
    return summary
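
# Minimal usage sketch, assuming `import nltk`, the punkt and stopwords corpora,
# and `from nltk.tokenize import RegexpTokenizer` are available; the article
# text is an illustrative placeholder.
article = (
    "Solar power is growing quickly. Solar panels convert sunlight into "
    "electricity. Many homes now install solar panels on their roofs. "
    "The cost of solar power has fallen sharply over the last decade."
)
print(summarize(article))  # prints up to 7 of the highest-scoring sentences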
def Preprocessing(df, contractions):
    pd.options.mode.chained_assignment = None
    contractionsDict = {}
    for i in contractions['data']:
        contractionsDict[i[0]] = i[1]

    # remove urls
    df['sentence'] = df['sentence'].str.replace(r'http\S+|www\.\S+',
                                                '',
                                                case=False,
                                                regex=True)

    # remove numbers
    df['sentence'] = df['sentence'].str.replace(r'\d+', '', regex=True)

    # remove hashtags
    df['sentence'] = df['sentence'].str.replace(r'#(\w+)', '', regex=True)

    # expand contractions
    for index, row in df.iterrows():
        df.at[index, 'sentence'] = ' '.join(
            contractionsDict[t] if t in contractionsDict else t
            for t in row['sentence'].lower().split())

    # build the stop word list, keeping negations such as 'not' and 'no'
    stop_words = [
        word for word in stopwords.words('english')
        if 'not' not in word and 'no' not in word
    ]

    # remove punctuation and stop words
    tokenizer = RegexpTokenizer(r'\w+')
    for index, row in df.iterrows():
        word_tokens = tokenizer.tokenize(row['sentence'])
        df.at[index, 'sentence'] = ' '.join(
            [w for w in word_tokens if w.lower() not in stop_words])

    # lemmatize
    wordnet_lemmatizer = WordNetLemmatizer()
    for index, row in df.iterrows():
        df.at[index, 'sentence'] = ' '.join(
            wordnet_lemmatizer.lemmatize(t) for t in row['sentence'].split())

    # remove non-English words
    english_words = set(nltk.corpus.words.words())
    for index, row in df.iterrows():
        word_tokens = tokenizer.tokenize(row['sentence'])
        df.at[index, 'sentence'] = " ".join(w for w in word_tokens
                                            if w.lower() in english_words or not w.isalpha())

    # remove non-alphabetic tokens
    for index, row in df.iterrows():
        word_tokens = tokenizer.tokenize(row['sentence'])
        df.at[index, 'sentence'] = " ".join(w for w in word_tokens if w.isalpha())

    return df
def clean_text(text, stop_words):
    '''Make text lowercase, remove mentions, remove links, convert emoticons/emojis to words, remove punctuation
    (except apostrophes), tokenize words (including contractions), convert contractions to full words,
    remove stop words.'''

    # make text lowercase
    text = text.lower()

    # remove mentions
    text = re.sub("(@[A-Za-z0-9]+)", "", text)

    # remove links
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'pic\.\S+', '', text)

    # convert emoticons
    emoticons = load_dict_emoticons()
    words = text.split()
    words_edit = [
        emoticons[word] if word in emoticons else word for word in words
    ]
    text = ' '.join(words_edit)

    # convert emojis
    text = emoji.demojize(text)
    text = text.replace(':', ' ')  # separate emojis-words with space

    # remove punctuation
    text = text.replace('...', ' ')  # special cases
    text = text.replace('-', ' ')
    text = text.translate(
        str.maketrans('', '', '!"$%&*()+,./;<=>?@[\\]^_`{|}~'))

    # tokenize words
    tokenizer = RegexpTokenizer("(#?[a-zA-Z]+[0-9]*(?:'[a-zx]+)?)")
    words = tokenizer.tokenize(text)

    # convert contractions (on the tokenized words)
    contractions = load_dict_contractions()
    words_edit = [
        contractions[word] if word in contractions else word for word in words
    ]
    text = ' '.join(words_edit)

    # remove stop words and lemmatize
    lemmatizer = WordNetLemmatizer()
    words = tokenizer.tokenize(text)
    words = [
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    ]
    text = ' '.join(words)

    return text
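
# Minimal usage sketch, assuming the helper functions referenced above
# (load_dict_emoticons, load_dict_contractions), the emoji package, and the
# NLTK imports used above are available; the tweet is an illustrative placeholder.
from nltk.corpus import stopwords

sample_stop_words = set(stopwords.words('english'))
tweet = "@user I can't believe it's already summer!! http://t.co/xyz :-)"
print(clean_text(tweet, sample_stop_words))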
Example 9
def remove_punctuation(statement):
    """
    Remove any punctuation from the statement text
    """
    # Make tokenizer
    tokenizer = RegexpTokenizer(r"\w+")

    #make a list of words without punctuation
    temp_text = tokenizer.tokenize(statement.text)
    statement.text = " ".join(tokenizer.tokenize(statement.text))
    print(statement.text)
    return statement
Example 10
def preprocess(problem: str, code: str):
    """Normalize and tokenize raw problem text and extract normalized comment lines from code file.
    Tokenized using RegexpTokenizer from NLTK.

    Args:
        problem (str): raw problem text from db
        code (str): raw code file, pre-encoded

    Returns:
        tuple: list of tokenized lines of problem text, list of tokenized lines of comments,
        and list of code lines with comments removed
    """
    tokenizer = RegexpTokenizer(r'\w+')

    # normalize problem text
    # remove html tags
    problem = re.sub(r'<.*?>', '', problem)
    # replace carriage return characters with spaces
    problem = problem.replace('\r', ' ')
    problem = problem.split('.')
    problem = [tokenizer.tokenize(line.lower()) for line in problem]

    # collapse runs of carriage returns, newlines, and tabs into single newlines
    code = re.sub(r'[\r\n\t]+', '\n', code)
    # collapse multiple spaces (also handles code indented with leading spaces);
    # any remaining leading whitespace is stripped below
    code = re.sub(r' +', ' ', code)
    # remove remaining leading whitespace
    code = '\n'.join([line.strip() for line in code.split('\n')])

    # extract comments from code using regex pattern
    # regex pattern from https://www.regexpal.com/94246
    re_pattern = r'/\*[\s\S]*?\*/|([^:]|^)//.*$'
    matcher = re.compile(re_pattern, re.MULTILINE)
    matches = matcher.finditer(code)
    # iterate through found matches while removing newline and leading-trailing whitespaces
    comments = []
    for match in matches:
        comment_line = match.group()
        comments.append(tokenizer.tokenize(comment_line.lower()))

    # get only the codes by removing comments
    code_only = matcher.sub("", code).split('\n')

    return problem, comments, code_only
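
# Minimal usage sketch, assuming the preprocess function above and its imports
# (re, RegexpTokenizer) are in scope; the problem text and code are illustrative.
problem_text = "<p>Read two integers.\r\nPrint their sum.</p>"
code_file = "// read the input values\nint a, b;\n/* add them */\nint c = a + b; // result\n"
problem_lines, comment_lines, code_lines = preprocess(problem_text, code_file)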
Example 11
    def add_to_index(self, document, doc_id):
        # parser = HTMLParser(text=document['data'])
        text = document['data']

        # print(1)

        nlp = Russian()
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        tokens = [token.lower() for token in tokens]
        tmp_text = ' '.join(tokens)
        if len(tokens) > 10e5:
            return
        self.doc_iter += 1
        nlp.max_length = 10e7
        doc_text = nlp(tmp_text, disable=['ner', 'parser'])
        lemmas = []
        for s in doc_text:
            lemma = s.lemma_
            lemmas.append(lemma)
            # optional stop word filtering (disabled):
            # if lemma not in set(stopwords.words('russian')) \
            #         and lemma not in set(stopwords.words('english')) \
            #         and len(lemma) > 1:
            #     lemmas.append(lemma)
        freq = FreqDist(lemmas)
        for k, v in freq.most_common():
            if k not in self.global_index:
                self.global_index[k] = []
            self.global_index[k].append((doc_id, v))
Example 12
def gen_counts(path_corpus, list_corpus):
    """ creates np array, for each corpus file how many words
    in that document """
    # create output
    counts_corpus = np.zeros(len(list_corpus))

    fp = None
    txt = u''
    tokens = []
    tokenizer = RegexpTokenizer(r'\w+')

    count = 0
    every = 500
    for f in list_corpus:
        # read in text
        fp = codecs.open(path_corpus+f, 'r', "utf-8", errors="ignore")
        txt = fp.read()
        txt = txt.lower()
        fp.close()

        # tokenize
        tokens = tokenizer.tokenize(txt) 
        counts_corpus[list_corpus.index(f)] = len(tokens)
        
        # count iterations
        if count % every == 0:
            print(count)
        count += 1

    return counts_corpus
Example 13
def read_all_txt_orig(directory):
    all_s = []
    for file in os.listdir(directory):
        full_path = os.path.join(directory, file)
        if not file.endswith(".txt"):
            continue
        with open(full_path) as f:
            captions = f.read().split('\n')
            for cap in captions:
                if len(cap) == 0 or len(cap) == 1:
                    continue
                cap = cap.replace("\ufffd\ufffd", " ")
                # picks out sequences of alphanumeric characters as tokens
                # and drops everything else
                tokenizer = RegexpTokenizer(r'\w+')
                tokens = tokenizer.tokenize(cap.lower())
                # print('tokens', tokens)
                if len(tokens) == 0:
                    print('cap', cap)
                    continue

                tokens_new = []
                for t in tokens:
                    if t == 'thisbirdhasadarkgreybelly':
                        print(123)
                    t = t.encode('ascii', 'ignore').decode('ascii')
                    if len(t) > 0:
                        tokens_new.append(t)
                all_s.append(" ".join(tokens_new) + "\n")
    return all_s
Example 14
def filter_sentence(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    word_tokens = tokenizer.tokenize(sentence)

    filtered_words = [w for w in word_tokens if not w in stop_words]
    snowball_result_set = [snowball_stemmer.stem(word) for word in filtered_words]
    return snowball_result_set
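
# Minimal usage sketch, assuming module-level `stop_words` and `snowball_stemmer`
# objects as used above, e.g.
#   stop_words = set(stopwords.words('english'))
#   snowball_stemmer = SnowballStemmer('english')
print(filter_sentence("the runners were running quickly"))
# typically yields ['runner', 'run', 'quick']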
    def tensor_vec_pipline(data, word_index, max_len):

        # Create the data matrix to be fed to the keras model
        print("Creating data to feed to tensorflow")
        df_len = len(data)
        indexing_matrix = np.zeros((df_len, max_len), dtype='int32')
        r_inc = 0

        tokenizer = RegexpTokenizer(r'\w+')

        for index, row in data.iterrows():

            sentence = row['sentence']
            sen_tokenize = tokenizer.tokenize(sentence)

            c_inc = 0

            for word in sen_tokenize:
                if c_inc >= max_len:
                    break
                try:
                    indexing_matrix[r_inc][c_inc] = word_index[word]
                except KeyError:
                    # out-of-vocabulary words map to index 0
                    indexing_matrix[r_inc][c_inc] = 0
                c_inc = c_inc + 1

            r_inc = r_inc + 1

        print("Run complete")

        return indexing_matrix
Example 16
def french_tokenizer(text):
    from nltk import RegexpTokenizer
    tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")
    toks = tokenizer.tokenize(text)
    # Optionally lemmatize (currently disabled):
    # toks = [fr_lexicon.get(t, t) for t in toks]
    return toks
Example 17
def read_and_clean_training_data(file):
    """
    The following function reads the training data and split them based the label
    :param file:
    :return:
    """
    with open(file, encoding="utf8", errors="ignore") as f:
        stop_words = set(stopwords.words('english'))
        lines = f.readlines()
        x_train = []
        y_train = []
        tokenizer = RegexpTokenizer(r'\w+')
        for i in range(len(lines)):
            line = lines[i]
            if i == 0:
                continue
            else:
                data, label = line.split('\t')
                label = label.strip()
                tokenized_data = tokenizer.tokenize(data)
                cleaned_data = [
                    word.lower() for word in tokenized_data
                    if not word.isdigit() and word != "ml"
                    and word not in stop_words
                ]
                final_data = " ".join(cleaned_data)
                x_train.append(final_data)
                y_train.append(label)
        return x_train, y_train
Example 18
def clean_test_data(test_data, test_labels):
    """
    Using that function you are able to read the test files and make the predictions
    :param test_data: test data file
    :param test_labels: test labels file
    :return:
    """
    xtest = []
    ytest = []
    with open(test_data, encoding="utf8", errors="ignore") as data:
        stop_words = set(stopwords.words('english'))
        tokenizer = RegexpTokenizer(r'\w+')
        lines = data.readlines()
        for i in range(len(lines)):
            line = lines[i]
            if i == 0:
                continue
            else:
                tokenized_line = tokenizer.tokenize(line)
                cleaned_data = [
                    word.lower() for word in tokenized_line
                    if not word.isdigit() and word != "ml"
                    and word not in stop_words
                ]
                xtest.append(" ".join(cleaned_data))
    with open(test_labels, encoding="utf8", errors="ignore") as labels:
        ydata = labels.readlines()
        for label in ydata:
            ytest.append(label.strip())
    return xtest, ytest
class RawLemmaTokenizer(object):
    def __init__(self):
        self.tokenizer = RegexpTokenizer(u'(?u)\\b\\w\\w+\\b')
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)]
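
# Minimal usage sketch, assuming scikit-learn is installed and the WordNet data
# is downloaded; plugging the tokenizer into CountVectorizer is an assumed use,
# not part of the original snippet.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=RawLemmaTokenizer(), lowercase=False)
X = vectorizer.fit_transform(["The cats were chasing mice", "A cat chased a mouse"])
print(vectorizer.get_feature_names_out())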
Example 20
    def regex_tokenizer(self, sent, whole_sent=False):
        regex_tokenizer = RT(r'\w+|\[M:.*?\]|[\(\)\.\,;\?\!]|\S+')
        tokens = regex_tokenizer.tokenize(sent)
        i = 0
        j = len(tokens) - 1

        # combine abbreviations with their period;
        # separation is a result of tokenizing the sentence
        while i < j:
            if re.match(self.abbrev_pattern,
                        tokens[i]) and tokens[i + 1] == '.':
                tokens[i:i + 2] = [''.join(tokens[i:i + 2])]
                j -= 1

            i += 1

        # return the tokenized sentence minus stop words and short words
        if not whole_sent:
            return [
                t for t in tokens if t not in self.stop_words and len(t) > 2
            ]

        # return the entire tokenized sentence
        else:
            return tokens
Example 21
def gen_vocab(vocab_fname, path):
    """ reads in a csv file,
        outputs as python list in given path
        as pickled object. unicode.
        Also adds unigrams for every line"""
    print("\ngen_vocab:{}".format(vocab_fname))
    # open file pointer
    f = codecs.open(path+vocab_fname, 'r', "utf-8")
            
    # output list
    concepts = []
                        
    # read in lines
    for line in f.readlines():
        concepts = concepts + line.lower().strip("\n").split(',')

    # from observation the concept lists all had ''
    while ('' in concepts):
        concepts.remove('')

    # add unigrams to concepts. does not preserve order of list
    unigrams = set()
    set_concepts = set(concepts)
    tokenizer = RegexpTokenizer(r'\w+')
                
    for phrase in concepts:
        unigrams.update(tokenizer.tokenize(phrase))
                                    
    set_concepts.update(unigrams)
    return list(set_concepts)
def analyze_dataset():
    l_sentences = []
    with open('/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-negative.txt') as file1:
        r = reader(file1, dialect='excel-tab')
        for row in r:
            l_sentences.append(row[0])
    with open('/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-positive.txt') as file2:
        r = reader(file2, dialect='excel-tab')
        for row in r:
            l_sentences.append(row[0])

    # chunk the given text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    d_lengths = defaultdict(int)
    tokenizer2 = RegexpTokenizer(r'\w+')

    # clean sentences from punctuation
    l_sentences = [''.join(ch for ch in sent if ch not in set(string.punctuation)) for sent in l_sentences]
    l_sentences = [len(tokenizer2.tokenize(sen)) for sen in l_sentences]
    total_sent = len(l_sentences)
    d_lengths = Counter(l_sentences)

    print total_sent
    lengths = sorted(d_lengths.iteritems(), key=lambda key_value: int(key_value[0]))
    plot(lengths)
Example 23
def create_bag_of_words(document_list):
    """
    Creates a bag of words representation of the document list given. It removes
    the punctuation and the stop words.

    :type document_list: list[str]
    :param document_list:
    :rtype: list[list[str]]
    :return:
    """
    tokenizer = RegexpTokenizer(r'\w+')
    cached_stop_words = set(stopwords.words("english"))
    body = []
    processed = []

    # remove common words and tokenize
    # texts = [[word for word in document.lower().split() if word not in stopwords.words('english')]
    #          for document in reviews]

    for i in range(0, len(document_list)):
        body.append(document_list[i].lower())

    for entry in body:
        row = tokenizer.tokenize(entry)
        processed.append([word for word in row if word not in cached_stop_words])

    return processed
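
# Minimal usage sketch, assuming `from nltk.tokenize import RegexpTokenizer` and
# `from nltk.corpus import stopwords` as used above; the documents are placeholders.
docs = [
    "Great food and friendly staff!",
    "The room was small, but the view was amazing.",
]
print(create_bag_of_words(docs))
# -> [['great', 'food', 'friendly', 'staff'], ['room', 'small', 'view', 'amazing']]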
Example 24
def clean_text(t):
    sentence = t.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    filtered_words = filter(
        lambda token: token not in stop_words, tokens)
    return " ".join(filtered_words)
    def processText(self,Estr):
        # ① Strip HTML tags
        content = re.sub(r'<[^>]*>', ' ', Estr)

        # ② Remove punctuation and other non-letter characters
        tokenizer = RegexpTokenizer(r'[a-z]+')
        raw = str(content).lower()
        content = tokenizer.tokenize(raw)

        # ③ Remove stop words
        # get the English stop word list
        en_stop = stopwords.words('english')  # get_stop_words('en')
        # optionally load a custom stop word list
        # file = os.getcwd()+"\\..\\datasets\\stopwords.txt"
        # f = open(file, "r")
        # mystopwords = f.read()
        # mystopwords= mystopwords.split('\n')
        # for word in mystopwords:
        #     en_stop.add(word)
        # drop the stop words from the text
        stopped_tokens = [i for i in content if i not in en_stop]

        # ④ Filter by token length
        content = [i for i in stopped_tokens if len(i) > 2]

        return content
Example 26
def prep_text_to_stem(text):
    """
    Remove partes indesejadas como números e palavras na stop_list. Além disso adicionar # ao final da
    palavra a fim de facilitar no stems de uma única letra
    :param text:
    :return:
    """
    text = list(filter(lambda x: type(x) == str, text))

    tokenizer = RegexpTokenizer(r'\w+', flags=re.UNICODE)
    tokens = tokenizer.tokenize(' '.join(text).lower())

    new_tokens = []

    stop_list = Counter(tokens).most_common(300)
    stop_list = [tup[0] for tup in stop_list]
    stop_list.append('series([],')

    for token in tokens:
        if token not in stop_list:
            token = ''.join(
                [letter for letter in token if not letter.isdigit()])
            for pun in punct:
                token = token.replace(pun, '')
            new_token = token + '#'
            new_tokens.append(new_token)

    return ' '.join(new_tokens)
def text2sents(text, lemmatize=False, stemmer=None):
    """
    converts a text into a list of sentences consisted of normalized words
    :param text: list of string to process
    :param lemmatize: if true, words will be lemmatized, otherwise -- stemmed
    :param stemmer: stemmer to be used, if None, PortedStemmer is used. Only applyed if lemmatize==False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)

    tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer

    sents_normalized = []

    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]

        sents_normalized.append(sent_normalized)
    return sents_normalized
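
# Minimal usage sketch using the stemming path (lemmatize=False), assuming
# `from nltk import sent_tokenize` and the other NLTK imports used above;
# the lemmatization path additionally requires a get_wordnet_pos helper.
sample = "The striped bats were hanging on their feet. They flew away at dusk."
print(text2sents(sample))
# -> one list of stemmed tokens per sentence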
Example 28
class Preprocessor(object):
    def __init__(self, max_workers=4):
        self.max_workers = max_workers
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.en_stopwords = set(get_stop_words('en'))
        self.p_stemmer = PorterStemmer()

    def preprocess_doc(self, doc):
        tokens = self.tokenizer.tokenize(doc.lower())

        stopped_tokens = [i for i in tokens if i not in self.en_stopwords]

        stemmed_tokens = [self.p_stemmer.stem(i) for i in stopped_tokens]

        return stemmed_tokens

    def process_docs(self, doc_list):
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            return list(executor.map(self.preprocess_doc, doc_list))

    def preprocess_doc_with_url(self, doc_with_url):
        url, content = doc_with_url

        return url, self.preprocess_doc(content)

    def process_docs_with_urls(self, urldoc_list):
        return [self.preprocess_doc_with_url(urldoc) for urldoc in urldoc_list]
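
# Minimal usage sketch, assuming the stop_words package providing get_stop_words('en')
# and the NLTK PorterStemmer import used above are available.
pre = Preprocessor()
print(pre.preprocess_doc("Stemming reduces words to their root form"))
# -> e.g. ['stem', 'reduc', 'word', 'root', 'form']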
def _get_ngram_features(infile, ngram_size):
    """
    Returns a dictionary containing ngrams and counts observed in a given file

    :param infile: file to be analysed
    :param ngram_size: ngram size
    :return: dict of ngrams/counts
    """
    # tokenizer which remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # dictionary of ngrams and counts
    d_ngrams = defaultdict(int)
    # stopwords
    stops = set(stopwords.words("english"))
    # lemmatizer for stemming
    lemmatizer = WordNetLemmatizer()

    # load train data
    with open(infile) as tsv:
        file_reader = reader(tsv, dialect="excel-tab")
        # skip title line
        next(file_reader)
        for line in file_reader:
            s_text = line[2]
            # remove punctuation and tokenize
            l_text = tokenizer.tokenize(s_text)
            # remove stopwords and stem
            l_text = [lemmatizer.lemmatize(word) for word in l_text if word not in stops]
            # get the ngrams for the given line
            l_temp = ngrams(l_text, ngram_size)
            for ngram in l_temp:
                d_ngrams[ngram] += 1

    return d_ngrams
def prepare_text(text: pd.Series) -> list:
    """
    Naive approach to text cleaning. Strip out HTML, then do relatively strict
    preparation (lemmatization, stopwords).

    :param text: series of all relevant text data
    :return: list of cleaned token lists, one per document
    """
    # first, remove html tags
    wo_html = text.apply(lambda x: BeautifulSoup(x, "lxml").text)

    tokenizer = RegexpTokenizer(r'\w+')
    stopword_set = set(stopwords.words('english'))
    lmtzr = WordNetLemmatizer()

    clean_text = []
    pbar = tqdm(range(len(text)), desc='clean_text')
    for d in wo_html:
        dlist = d.lower()
        dlist = tokenizer.tokenize(dlist)
        dlist = list(set(dlist).difference(stopword_set))
        # filter tokens
        filtered_tokens = []
        for token in dlist:
            if re.search('^[a-zA-Z]+$', token) and len(token) >= 4:
                filtered_tokens.append(token)
        # lemmatize
        stems = [lmtzr.lemmatize(t) for t in filtered_tokens]
        final_stems = [stem for stem in stems if len(stem) > 3]
        clean_text.append(final_stems)
        pbar.update()
    pbar.close()
    return clean_text
Example 31
    def preprocessing(self):
        self.df = pd.read_csv('static/models/resampled_comments_1.csv')
        self.comments = self.df[['comment', 'rating', 'sentiment']]
        self.comments['comment'] = self.comments['comment'].map(
            lambda x: x.lower())

        toknizer = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
        token = self.comments.apply(
            lambda row: toknizer.tokenize(row['comment']), axis=1)

        stop_words = set(stopwords.words('french'))
        stop_token = token.apply(
            lambda x: [item for item in x if item not in stop_words])

        stemmer = SnowballStemmer(language='french')
        stemm = stop_token.apply(lambda x: [stemmer.stem(y) for y in x])

        lemmatizer = FrenchLefffLemmatizer()
        lemm = stemm.apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

        for i in range(len(lemm)):
            lemm[i] = ' '.join(lemm[i])

        self.comments['lemmatiser_com'] = lemm
        data = self.comments[['comment', 'lemmatiser_com', 'sentiment']]

        self.df = pd.DataFrame(data)
        return self.df
Example 32
def test():
    global N, words, network

    print 'In testing.'

    gettysburg = """Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth."""
    tokenizer = RegexpTokenizer('\w+')
    gettysburg_tokens = tokenizer.tokenize(gettysburg) 

    samples = []
    for token in gettysburg_tokens:
        word = token.lower()
        if word not in ENGLISH_STOP_WORDS and word not in punctuation:
            samples.append(word)

    dist = FreqDist(samples)
    V = Vol(1, 1, N, 0.0)
    for i, word in enumerate(words):
        V.w[i] = dist.freq(word)

    pred = network.forward(V).w
    topics = []
    while len(topics) != 5:
        max_act = max(pred)
        topic_idx = pred.index(max_act)
        topic = words[topic_idx]

        if topic in gettysburg_tokens:
            topics.append(topic)
    
        del pred[topic_idx]

    print 'Topics of the Gettysburg Address:'
    print topics
Example 33
def french_tokenizer(text):
    from nltk import RegexpTokenizer
    tokenizer = RegexpTokenizer(r"(?u)\b\w\w+\b")
    toks = tokenizer.tokenize(text)
    # Optionally lemmatize (currently disabled):
    # toks = [fr_lexicon.get(t, t) for t in toks]
    return toks
Example 34
def analyze_dataset():
    l_sentences = []
    with open(
            '/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-negative.txt'
    ) as file1:
        r = reader(file1, dialect='excel-tab')
        for row in r:
            l_sentences.append(row[0])
    with open(
            '/Users/miljan/PycharmProjects/thesis-shared/data/pang_and_lee_data/rt-positive.txt'
    ) as file2:
        r = reader(file2, dialect='excel-tab')
        for row in r:
            l_sentences.append(row[0])

    # chunk the given text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    d_lengths = defaultdict(int)
    tokenizer2 = RegexpTokenizer(r'\w+')

    # clean sentences from punctuation
    l_sentences = [
        ''.join(ch for ch in sent if ch not in set(string.punctuation))
        for sent in l_sentences
    ]
    l_sentences = [len(tokenizer2.tokenize(sen)) for sen in l_sentences]
    total_sent = len(l_sentences)
    d_lengths = Counter(l_sentences)

    print total_sent
    lengths = sorted(d_lengths.iteritems(),
                     key=lambda key_value: int(key_value[0]))
    plot(lengths)
Example 35
def clean_text(text, stop_words):
    '''Make text lowercase, tokenize words and words with apostrophes, convert contractions to full words,
    lemmatize by POS tag, remove stop words and words shorter than 3 letters.'''
    
    # make text lowercase
    text = text.lower().replace("’", "'")

    # initial tokenization to remove non-words
    tokenizer = RegexpTokenizer("([a-z]+(?:'[a-z]+)?)")
    words = tokenizer.tokenize(text)

    # convert contractions
    contractions = load_dict_contractions()
    words = [contractions[word] if word in contractions else word for word in words]
    text = ' '.join(words)

    # remove stop words, lemmatize using POS tags, and remove two-letter words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in nltk.word_tokenize(text) \
             if word not in stop_words]
    
    # removing any words that got lemmatized into a stop word
    words = [word for word in words if word not in stop_words]
    words = [word for word in words if len(word) > 2]
    text = ' '.join(words)
    
    return text
Example 36
def analyze_articles():
    json_document = _read_json_articles()
    l_articles = [
        json_document[i]['_source']['content']
        for i in range(len(json_document))
    ]

    # chunk the given text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    d_lengths = defaultdict(int)
    tokenizer2 = RegexpTokenizer(r'\w+')
    total_sent = 0

    for article in l_articles:
        l_sentences = tokenizer.tokenize(article)
        # clean sentences from punctuation
        l_sentences = [
            ''.join(ch for ch in sent if ch not in set(string.punctuation))
            for sent in l_sentences
        ]
        l_sentences = [len(tokenizer2.tokenize(sen)) for sen in l_sentences]
        total_sent += len(l_sentences)
        d_counts = Counter(l_sentences)
        for key in d_counts.keys():
            d_lengths[str(key)] += d_counts[key]
    print total_sent
    lengths = sorted(d_lengths.iteritems(),
                     key=lambda key_value: int(key_value[0]))
    plot(lengths)
def frequencyAnalyse(polarised_tweets : Dict):
    positive_words = {}
    negative_words = {}
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = list(stopwords.words('english'))
    for i in polarised_tweets:
        word_pit =  tokenizer.tokenize(polarised_tweets[i][0])
        tags = nltk.pos_tag(word_pit)
        for word in tags:
            if word[0] in positive_words:
                positive_words[word[0]] += 1
                continue
            elif word[0] in negative_words:
                negative_words[word[0]] += 1
                continue
            if len(word[0]) < 3:
                continue
            if word[0].lower() in stop_words:
                continue
            if word[1] in ['JJ']:
                if polarised_tweets[i][1] > 0.2:  #Positive
                    positive_words[word[0].lower()] = 1
                elif polarised_tweets[i][1] < -0.2:  #Negative
                    negative_words[word[0].lower()] = 1
    for w in sorted(negative_words, key=negative_words.get, reverse=True):
        print(w, negative_words[w])
    return (positive_words, negative_words)
Example 38
def get_documents_text(act_id, **kwargs):
    """
    Returns the concatenated, tag-stripped text of all documents related to act_id
    """
    db_conn = kwargs['db']

    italian_stops = set(stopwords.words('italian'))


    cursor = db_conn.cursor(MySQLdb.cursors.DictCursor)
    sql = """
        select d.testo
         from opp_documento as d
         where d.atto_id=%s
    """
    cursor.execute(sql, act_id)
    rows = cursor.fetchall()
    cursor.close()

    testo = u''
    for row in rows:
        # strip html tags from texts, if present
        testo += unicode(
            strip_tags(
                row['testo']
            )
        )

    # remove stopwords
    tokenizer = RegexpTokenizer("[\w]+")
    words = tokenizer.tokenize(testo)
    filtered_testo = " ".join([word for word in words if word.lower() not in italian_stops])

    return filtered_testo
Example 39
    def tokenize(self, string):
        # Remove unnecessary spaces
        space = re.compile(r' +')
        string = re.sub(space, ' ', string)

        # Normalize telephone numbers
        tel = re.compile(
            r'(?P<sep1>0[0-9])( |/+|\-|\\+)(?P<sep2>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep3>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep4>[0-9]{2})( |/+|\.|\-|\\+)(?P<sep5>[0-9]{2})'
        )
        string = tel.sub(r'\g<sep1>.\g<sep2>.\g<sep3>.\g<sep4>.\g<sep5>',
                         string)

        # Tokenization
        # The tokenizer automatically drops the following isolated characters: ` ^ ° ¤ ¨
        # Recognized as tokens:
        # - Emails
        # - Websites, domain names, usernames, etc.
        # - Shortened telephone numbers
        # - Compound words
        # - Common words
        # - Punctuation
        tokenizer = RegexpTokenizer(
            r'''([Aa]ujourd'hui|\w+'|[a-zA-ZÀ-Ÿà-ÿ0-9_\.\-]+@[a-zA-ZÀ-Ÿà-ÿ0-9\-\.]+\.[a-zA-ZÀ-Ÿà-ÿ0-9]+|[a-zA-ZÀ-Ÿà-ÿ0-9:@%/;$~_?\+\-=\\\.&\|£€]+[a-zA-ZÀ-Ÿà-ÿ0-9#@%/$~_?\+\-=\\&\|£€]+|[\wÀ-Ÿà-ÿ]+[/\-][\wÀ-Ÿà-ÿ]+|[\wÀ-Ÿà-ÿ0-9]+|\.\.\.|[\(\)\[\]\{\}\"\'\.,;\:\?!\-\_\*\#\§=+<>/\\])'''
        )
        tokens = tokenizer.tokenize(string)
        return tokens
Example 40
def tokenize(text):
    """
    Input: "Body of text..."
    Output: [word, ...] list of tokenized words matching regex '\w+'
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    return tokens
Example 41
def tokenize(self, text):
    """
    tokenise text using nltk RegexpTokenizer
    :param text:
    :return: list of tokens
    """
    tokenizer = RegexpTokenizer(self.pattern)
    tokens = tokenizer.tokenize(text)
    return tokens
Example 42
class StemTokenizer(object):

    def __init__(self):
        self.wnl = PorterStemmer()
        self.mytokenizer = RegexpTokenizer('\\b\\w+\\b')

    def __call__(self, doc):
        #return [self.wnl.stem(t) for t in word_tokenize(doc)]
        return [self.wnl.stem(t) for t in self.mytokenizer.tokenize(doc)]
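
# Minimal usage sketch, assuming scikit-learn is available; passing the callable
# class to TfidfVectorizer is an assumed use, not part of the original snippet.
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(tokenizer=StemTokenizer())
tfidf = vec.fit_transform(["running runs ran", "the runner was running"])
print(vec.get_feature_names_out())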
Example 43
class StemTokenizer(object):
    def __init__(self):
        from nltk import RegexpTokenizer
        from nltk.stem import PorterStemmer

        self.wnl = PorterStemmer()
        self.mytokenizer = RegexpTokenizer('\\b\\w+\\b')

    def __call__(self, doc):
        return [self.wnl.stem(t) for t in self.mytokenizer.tokenize(doc)]
def tokenize(self, text):
    """
    Tokenize text using the nltk RegexpTokenizer.
    :param text: text to tokenize
    :return: tokens
    """
    tokenizer = RegexpTokenizer(self.pattern)
    tokens = tokenizer.tokenize(text)
    return tokens
Example 45
def __call__(self, doc):
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords
    # tokenizer = RegexpTokenizer(r'\w+')
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    # words = [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    words = [self.wnl.lemmatize(t) for t in tokenizer.tokenize(doc)]
    mystops = (u'youtube', u'mine', u'this', u'that', 'facebook', 'com', 'google', 'www', 'http', 'https')
    stop_words = set(stopwords.words('english'))
    stop_words.update(mystops)

    stop_words = list(stop_words)
    return [i.lower() for i in words if i not in stop_words]
def tokenizeWords(corpus_root):
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    tokenizer = RegexpTokenizer(r'\w+')
    # for fileid in wordlists.fileids():
    #     sentimentText=wordlists.raw(fileid).lower()
    #     tokenizedWords=tokenizer.tokenize(sentimentText)
    #     tokenizedTextWithoutStopWords=removeAllStopWords(tokenizedWords)
    #
    #     print(tokenizedTextWithoutStopWords)
    #     if "positive" in corpus_root:
    #         print("positive documents")
    #         #posfeats.update(word_feats(tokenizedTextWithoutStopWords),'pos')
    #         #posfeats =posfeats+[word_feats(tokenizedTextWithoutStopWords), 'pos']
    #         posfeats[word_feats(tokenizedTextWithoutStopWords)]='pos'
    #
    #     if "negative" in corpus_root:
    #         negfeats.update(word_feats(tokenizedTextWithoutStopWords),'neg')
    if "negative" in corpus_root:
        negfeats = [(word_feats(removeAllStopWords(tokenizer.tokenize(wordlists.raw(f).lower()))), 'neg') for f in wordlists.fileids()]
    if "positive" in corpus_root:
        posfeats = [(word_feats(removeAllStopWords(tokenizer.tokenize(wordlists.raw(f).lower()))), 'pos') for f in wordlists.fileids()]
        print(posfeats)
Example 47
def tokenize_and_stem(doc):
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    tokens = tokenizer.tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in en_stop and len(token) > 2]
    final = [p_stemmer.stem(word) for word in clean]


    return final
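
# Minimal usage sketch, assuming `from stop_words import get_stop_words`,
# `from nltk.tokenize import RegexpTokenizer`, and
# `from nltk.stem.porter import PorterStemmer` as used above.
print(tokenize_and_stem("The players were playing football in the rain"))
# -> e.g. ['player', 'play', 'footbal', 'rain']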
Example 48
def gen_doc_term_counts(path_corpus, list_corpus, list_vocab):
    """ generates document-term matrix given a path to a
        corpus and common vocab
    """
    print("\ngen_doc_term_counts:{}".format(path_corpus))
    num_docs = len(list_corpus)
    num_terms = len(list_vocab)
    doc_term = np.zeros((num_docs, num_terms))
    counts_corpus = np.zeros(num_docs)

    # generate (dict) compiled regex's
    re_c_vocab = gen_regex_c(list_vocab)
    tokenizer = RegexpTokenizer(r'\w+')

    # iterate over files
    fp = None
    txt = u''
    r = None
    num = 0.0
    tokens = []
        
    count = 0
    every = 50
    start= timeit.default_timer()
    checkpoint = 0.0
    for i in range(num_docs):
        fp = codecs.open(path_corpus+list_corpus[i], 'r', "utf-8", errors="ignore")
        txt = fp.read()
        txt = txt.lower()
        fp.close()
        
        # tokenize
        tokens = tokenizer.tokenize(txt) 
        counts_corpus[i] = len(tokens)
        
        # count occurrences of each vocab term
        for j in range(num_terms):
            r = re_c_vocab[ list_vocab[j] ]
            # r is precompiled, so no flags argument is passed to findall
            num = len(r.findall(txt))
            doc_term[i,j] = num
            
        if (count % every == 0):
            checkpoint = timeit.default_timer()
            print(count, round(checkpoint-start, 2))
        count += 1
            
    
    return (doc_term, counts_corpus)
Example 49
def __call__(self, doc, string_tokenize='[a-zA-Z0-9]+'):
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords
    from nltk.corpus import wordnet as wn
    # tokenizer = RegexpTokenizer(r'\w+')
    tokenizer = RegexpTokenizer(string_tokenize)
    # words = [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    words = [self.wnl.lemmatize(t) for t in tokenizer.tokenize(doc)]
    mystops = (u'youtube', u'mine', u'this', u'that')
    stop_words = set(stopwords.words('english'))
    stop_words.update(mystops)
    stop_words = list(stop_words)
    words1 = [i.lower() for i in words if i not in stop_words]
    words2 = list(set(list({l.name() for word in words1 for s in wn.synsets(word) for l in s.lemmas()}) + words1))

    return [i.lower() for i in words2 if i not in stop_words]
Example 50
def build_vector(text, neutral):
    # We tokenize the text 
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    if neutral:
        tokens = pos_tag(tokens) # we add POS tag
        forbidden_pos = ['RB', 'RBS', 'RBR', 'CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'PDT', 'PRP', 'PRP$', 'RP', 'SYM', 'TO', 'WDT', 'WP', 'WP$', ]
 
    # We build the document vector
    vector = set()
    for couple in tokens:

        if neutral:
            if (couple[1] in forbidden_pos):
                continue
        
            vector.add(lemmatize(couple[0]))
        else:
            vector.add(lemmatize(couple))
		
    return vector
Example 51
def test(): 
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print 'Query:', documents[-1]

    tokenizer = RegexpTokenizer('\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [ doc_code(v) for v in vols[:-1] ]
    query_vec = doc_code(vols[-1])

    sims = [ cos(v, query_vec) for v in vectors ]
    m = max(sims)
    print m, documents[sims.index(m)]
def analyze_articles():
    json_document = _read_json_articles()
    l_articles = [json_document[i]['_source']['content'] for i in range(len(json_document))]

    # chunk the given text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    d_lengths = defaultdict(int)
    tokenizer2 = RegexpTokenizer(r'\w+')
    total_sent = 0

    for article in l_articles:
        l_sentences = tokenizer.tokenize(article)
        # clean sentences from punctuation
        l_sentences = [''.join(ch for ch in sent if ch not in set(string.punctuation)) for sent in l_sentences]
        l_sentences = [len(tokenizer2.tokenize(sen)) for sen in l_sentences]
        total_sent += len(l_sentences)
        d_counts = Counter(l_sentences)
        for key in d_counts.keys():
            d_lengths[str(key)] += d_counts[key]
    print total_sent
    lengths = sorted(d_lengths.iteritems(), key=lambda key_value: int(key_value[0]))
    plot(lengths)
Example 53
def create_bag_of_words(document_list):
    """
    Creates a bag of words representation of the document list given. It removes
    the punctuation and the stop words.

    :type document_list: list[str]
    :param document_list:
    :rtype: list[list[str]]
    :return:
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tagger = nltk.PerceptronTagger()
    cached_stop_words = set(stopwords.words("english"))
    cached_stop_words |= {
        't', 'didn', 'doesn', 'haven', 'don', 'aren', 'isn', 've', 'll',
        'couldn', 'm', 'hasn', 'hadn', 'won', 'shouldn', 's', 'wasn',
        'wouldn'}
    body = []
    processed = []

    for i in range(0, len(document_list)):
        body.append(document_list[i].lower())

    for entry in body:
        row = tokenizer.tokenize(entry)
        tagged_words = tagger.tag(row)

        nouns = []
        for tagged_word in tagged_words:
            if tagged_word[1].startswith('NN'):
                nouns.append(tagged_word[0])

        nouns = [word for word in nouns if word not in cached_stop_words]
        processed.append(nouns)

    return processed
class StemmingTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.tokenizer = RegexpTokenizer(r'(?u)\b[a-z]+-*[a-z]+|\b[a-z]\b')
    def __call__(self, doc):
        return [self.stemmer.stem(tokens.lower()) for tokens in self.tokenizer.tokenize(doc)]
Example 55
    nyt_labels.append(line[4])
nyt.close()

f = open('/home/mikhail/Documents/research/hierarchical_classification/Inter_Observ/Interv.csv', 'a')

writer = csv.writer(f)
countNCT = 0
countSCRT = 0
ope = 0

try:
    for i in range(0,len(nyt_data),1):
        text = nyt_data[i]+" "+nyt_data1[i]
        try:
            observ = RegexpTokenizer("NCT[0-9]{8}")
            obs = observ.tokenize(nyt_labels[i])
            if len(obs) > 0:
                page = urllib2.urlopen('http://clinicaltrials.gov/show/'+obs[0]+'?resultsxml=true')
                document = ElementTree.parse(page)
                page_content = page.read()

                study_design = document.findtext('study_type')
                writer.writerow((text.replace("\'","").replace("\"","").replace("\\","").replace("\/",""),study_design))
                print("NORMAL"+" "+nyt_labels[i]+" "+study_design)
                countNCT +=1
        except Exception,e:
            print(nyt_labels[i])
    #error2_writer.writerow((encode_id,encode_date,encoded_user,encoded_str,NCT))
            print e
            pass
Example 56
    pformat(lcwd)

def filter_stopwords(words):
    important_words = filter(lambda x: x not in stopwords.words('english'), words)
    return important_words

#Parsing the command line arguments for the filename 
parser = argparse.ArgumentParser(description = 'Process a text file.')
parser.add_argument('filename', type=str, help='pathname to that file')
parser.add_argument('cut_off', type=int, help='cut off value for the list output')
args = parser.parse_args()
print args

filename = args.filename
#open the file 
pp = open(filename)
CUTOFF = args.cut_off
# string of the text 
text = pp.read()
# the list of words 
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)

print "Without filtering out the stop words"
process_words(tokens)

print "With stop words filtering" 

fil_token = filter_stopwords(tokens)
process_words(fil_token)
Example 57
			tokens.append(ngram)
	return tokens

indonesian = []
malaysian = []
tamil = []
others = []

lang_dict = {'indonesian' : indonesian, 'malaysian' : malaysian, 'tamil' : tamil, 'others' : others}

file_content = getFileContents('input.train.txt')
for line in file_content:
	try:
		#tokens = nltk.word_tokenize(line)
		tokenizer = RegexpTokenizer(r'\w+')
		tokens = tokenizer.tokenize(line)
		language = tokens[0]
		del tokens[0]
		ngrams = getNgrams(tokens)
		for gram in ngrams:
			lang_dict[language].append(gram)
	except UnicodeEncodeError:
		pass

for key,value in lang_dict.items():
	with open('stuff.txt', 'a') as f:
		f.write('\n')
		f.write(key)
		f.write('\n')
		for k in value:
			try:
Example 58
def stemming(doc):

    wnl = PorterStemmer()
    mytokenizer = RegexpTokenizer('\\b\\w+\\b')

    return [wnl.stem(t) for t in mytokenizer.tokenize(doc)]
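
# Minimal usage sketch, assuming `from nltk.stem import PorterStemmer` and
# `from nltk.tokenize import RegexpTokenizer` are in scope.
print(stemming("caching cached caches quickly"))
# -> e.g. ['cach', 'cach', 'cach', 'quickli']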
Example 59
from nltk.corpus import cess_esp as cess
from nltk import RegexpTokenizer
import nltk
import pickle

# My sentences
sentence = "hola, hola, soy Pedro ¿como te llamas?."
tokenizer = RegexpTokenizer(r'\w+')
tokenized_words = tokenizer.tokenize(sentence)

# Dec train/test
train = None
test = None
cess_sents = cess.tagged_sents()
try:
    with open('test_pickles/test_data.pickle', 'rb') as fa:
        div = pickle.load(fa)
        train = cess_sents[:div]
        test = cess_sents[div+1:]
except FileNotFoundError as a:
    # training data
    print("dumping train/test")
    div = len(cess_sents)*90//100
    train = cess_sents[:div]
    test = cess_sents[div+1:]

    with open('test_pickles/test_data.pickle', 'wb') as fb:
        pickle.dump(div, fb)

#####
#