import re

from gensim.parsing import preprocessing as preprocess
from gensim.parsing.porter import PorterStemmer

# assumed: the original module used a Porter stemmer; gensim's implementation
# is used here
stemmer = PorterStemmer()


def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips numbers,
    strips http addresses, strips Unicode from emoji etc., lowercases
    everything, strips extra spaces, punctuation and non-alphanumeric
    symbols. Also performs stemming.

    input:
        text: a string
    returns:
        the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text)  # remove stop words
    text = preprocess.strip_short(text)  # get rid of short words
    text = preprocess.strip_numeric(text)  # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)  # strip http addresses
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)  # strip non-ASCII characters (emoji etc.)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)
    return text
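# A minimal usage sketch for the function above, not part of the original
# snippet. It assumes gensim is installed; the sample tweet is invented.
if __name__ == '__main__':
    sample = "Check out https://example.com !!! 42 cats are GREAT"
    # the URL, the numbers and the stopwords are stripped; the remaining
    # words are lowercased and stemmed
    print(preprocessing(sample))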
from gensim.parsing import preprocessing as proc
from gensim.utils import deaccent, to_unicode

# non_plain_re: precompiled module-level regex matching urls, @usernames,
# #tags, emojis and numbers (defined elsewhere in the original module)


def remove_non_plain(document):
    """
    Replaces urls, @usernames, #tags, emojis and numbers with a ' ' (space).
    Also removes accents and punctuation, then removes redundant whitespace
    and lowercases all characters.

    :param document: string
    :return: processed unicode string
    """
    document = to_unicode(document)
    document = non_plain_re.sub(' ', document)
    document = proc.strip_non_alphanum(document)
    document = proc.strip_numeric(document)
    document = proc.strip_multiple_whitespaces(document)
    document = deaccent(document)
    return document.lower()
import re

import contractions
from gensim.parsing import preprocessing


def clean_text(text):
    """
    Cleans the text in the only argument in various steps.

    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string
    """
    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_non_alphanum(text))
    return text
import csv

from gensim.parsing.preprocessing import (remove_stopwords, strip_non_alphanum,
                                          strip_numeric, strip_short)


def import_data(file, row_content, x):
    content = []
    # use a context manager so the file handle is closed (the original left
    # it open)
    with open(file, 'r') as content_1:
        csv_reader = csv.reader(content_1)
        for row in csv_reader:
            row_new = remove_stopwords(row[row_content])
            row_new = strip_numeric(row_new)
            row_new = strip_non_alphanum(row_new)
            row_new = strip_short(row_new, minsize=3)
            content.append(row_new)
    # every document read from this file gets the same label x
    label = [x] * len(content)
    return content, label
def get_text(self):
    """
    Get the pre-processed text extracted from the Notebook '#TEXT' section

    Returns
    -------
    text : str
        Extracted text
    """
    lines = self.read().splitlines()
    raw_text = ' '.join(
        [line for line in lines[1:lines.index('#FIGURES')]])
    text = strip_non_alphanum(strip_punctuation(raw_text.lower()))
    return text
import re

from gensim.parsing.preprocessing import (remove_stopwords,
                                          strip_multiple_whitespaces,
                                          strip_non_alphanum, strip_numeric,
                                          strip_punctuation, strip_short)


def clean_text(x: str) -> str:
    """
    :param x: raw string
    :return x: cleaned string
    """
    x = x.lower()
    x = re.sub('ssense|exclusive', '', x)
    x = strip_non_alphanum(x)
    x = strip_numeric(x)
    x = strip_short(x, minsize=2)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)
    return x
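# A brief, hypothetical example for clean_text above. The sample title is
# invented; 'ssense' and 'exclusive' are store-specific noise that the regex
# removes.
if __name__ == '__main__':
    title = "SSENSE Exclusive Black Leather Jacket, Size 52"
    print(clean_text(title))  # -> roughly "black leather jacket size"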
from gensim.parsing.preprocessing import preprocess_string, strip_non_alphanum


def setname(namevalue):
    global name
    name = namevalue.lower()
    print(preprocess_string(name))  # debug output of the preprocessed tokens
    try:
        preprocessed = [
            word for word in preprocess_string(name)
            if word not in ('people', 'call', 'friend', 'hey', 'hi', 'hei',
                            'cafe', 'buddi')
        ][0]
        name = [
            word for word in strip_non_alphanum(name.lower()).split()
            if preprocessed in word
        ][0]
    except IndexError:
        # no candidate token survived the filter (the original used a bare
        # except); fall back to the first whitespace-separated word
        name = name.split()[0]
    name = name[0].upper() + name[1:]
def get_text(self):
    """
    Get the pre-processed text extracted from the Remark document

    Returns
    -------
    text : str
        Extracted text
    """
    with open(self.path + self.name, mode='r') as file:
        # the first line (header) is read and discarded
        first_line, raw_text = file.readline(), file.read()
    text = strip_non_alphanum(strip_punctuation(raw_text.lower()))
    return text
def word_tokenize(text):
    try:
        if isinstance(text, str):
            words = text.lower().split()
        else:
            words = str(text).lower().split()
        if len(words) == 0:
            return ''
        text = ' '.join(words)
        text = preprocessing.strip_punctuation(text)
        text = preprocessing.strip_non_alphanum(text)
        text = preprocessing.strip_numeric(text)
        text = preprocessing.strip_tags(text)
        text = preprocessing.strip_multiple_whitespaces(text)
        return text.encode('utf-8')
    except UnicodeDecodeError:
        return ''
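# Quick check of word_tokenize above (input invented). Note that on success
# it returns UTF-8 *bytes*, so callers may need to decode('utf-8') the result.
if __name__ == '__main__':
    print(word_tokenize("Hello, World!! 123"))  # -> roughly b'hello world '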
import pandas as pd
from gensim.parsing.preprocessing import (remove_stopwords, strip_non_alphanum,
                                          strip_punctuation, strip_short)

# cables_from_source, fname and lemmatization are defined elsewhere in the
# original module


def telegrams():
    # DataFrame.append was removed in pandas 2.0, so collect rows in a list
    # and build the frame once at the end
    rows = []
    for j, cable in enumerate(cables_from_source(fname)):
        print("Generating telegram {}".format(j), end='\r')
        content = cable.content
        content = content[content.find("1. "):len(content) - 1].lower()
        content = strip_short(content, minsize=3)
        content = strip_punctuation(content)
        content = strip_non_alphanum(content)
        content = remove_stopwords(content)
        content = lemmatization(content, ['NOUN'])
        rows.append({'lista': content, 'index': j})
    return pd.DataFrame(rows, columns=['index', 'lista'])
from gensim.parsing.preprocessing import (strip_multiple_whitespaces,
                                          strip_non_alphanum)
from whoosh import scoring
from whoosh.index import open_dir
from whoosh.qparser import OrGroup, QueryParser
from whoosh.query import FuzzyTerm


def query_indexer(query_string, directory, topN=30):
    '''
    query_string - sentence used to perform the search.
    directory - location of the indexer to be used.
    topN - number of documents returned by the query. The default is 30.
    '''
    ix = open_dir(directory)
    query_string = strip_non_alphanum(query_string)
    query_string = strip_multiple_whitespaces(query_string)
    with ix.searcher(weighting=scoring.BM25F) as searcher:
        # with ix.searcher(weighting=scoring.Frequency) as searcher:
        query = QueryParser("question", ix.schema, termclass=FuzzyTerm,
                            group=OrGroup).parse(query_string)
        try:
            options = []
            options_answers = []
            options_docnumbers = []
            results = searcher.search(query, limit=topN, terms=True)
            loop_range = topN if topN <= len(results) else len(results)
            for i in range(loop_range):
                # this needs to be adapted in order to work with the Whoosh
                # Chatbot; uncomment the next line in order to work with the
                # normal Chatbot
                options_answers.append(results[i]['response'])
                options.append(results[i]['question'])
                options_docnumbers.append(results[i].docnum)
            return options, options_answers, options_docnumbers
            # return the element with the highest similarity score from the
            # indexer:
            # return results[0]['response']
        except IndexError:
            return None
from gensim.parsing.preprocessing import (split_alphanum,
                                          strip_multiple_whitespaces,
                                          strip_non_alphanum, strip_short,
                                          strip_tags)


def gensim_clean_string(textIn,
                        _strip_tags=True,
                        _split_alphanumeric=True,
                        _strip_nonalphanumeric=True,
                        _strip_muliple_whitespace=True,
                        _strip_short=True,
                        _short_charcount_min=3,
                        _strip_punctuation=False,
                        _convert_to_lower=False):
    # NOTE: the _strip_punctuation flag is accepted but never applied in the
    # original code
    cleaner = textIn
    if _strip_tags:
        cleaner = strip_tags(textIn)
    if _strip_nonalphanumeric:
        cleaner = strip_non_alphanum(cleaner)
    if _strip_muliple_whitespace:
        cleaner = strip_multiple_whitespaces(cleaner)
    if _split_alphanumeric:
        cleaner = split_alphanum(cleaner)
    if _strip_short:
        cleaner = strip_short(cleaner, minsize=_short_charcount_min)
    if _convert_to_lower:
        cleaner = cleaner.lower()
    return cleaner
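# Hypothetical call illustrating the flags of gensim_clean_string above
# (sample markup invented): keep short words, but lowercase the result.
if __name__ == '__main__':
    cleaned = gensim_clean_string("<b>Model X12 beats   baseline!</b>",
                                  _strip_short=False,
                                  _convert_to_lower=True)
    print(cleaned)  # tags stripped, "X12" split to "X 12", then lowercased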
import re
import urllib.parse

from gensim.parsing.preprocessing import (split_alphanum, strip_non_alphanum,
                                          strip_short)
from pyvi import ViTokenizer

# strip_number_alphabets and stopwordsVN_ENG are project-specific helpers
# defined elsewhere in the original codebase


def text_preprocess(bodyItem):
    # bodyItem: string (of one mail) => return: list of words (of one mail)
    # Remove http, https URLs
    bodyItem = re.sub(r'^https?:\/\/.*[\r\n]*', '', bodyItem,
                      flags=re.MULTILINE)
    bodyItem = re.sub(r'^http?:\/\/.*[\r\n]*', '', bodyItem,
                      flags=re.MULTILINE)
    bodyItem = re.sub(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        " ", bodyItem)
    # Decode some bodyItems which are not decoded (quoted-printable style)
    bodyItem = bodyItem.replace("=", "%")
    bodyItem = urllib.parse.unquote(bodyItem)
    # Remove words which mix numbers and alphabets
    bodyItem = strip_number_alphabets(bodyItem)
    # Remove meaningless symbols and convert to lower case
    bodyItem = strip_non_alphanum(bodyItem).lower().strip()
    bodyItem = split_alphanum(bodyItem)
    # Join multi-word Vietnamese tokens. Ex: hội thảo -> hội_thảo
    bodyItem = ViTokenizer.tokenize(bodyItem)
    # Remove one-letter words
    bodyItem = strip_short(bodyItem, minsize=2)
    # Remove stopwords
    words = [word for word in bodyItem.split()
             if word not in stopwordsVN_ENG.getStopwordsVN_ENG()]
    return words
from gensim.parsing.preprocessing import (strip_multiple_whitespaces,
                                          strip_non_alphanum,
                                          strip_punctuation, strip_short)
from nltk.tokenize import sent_tokenize

# replace_numbers is a project-specific helper that spells digits out as words


def data_preprocessing(para):
    """
    This function takes in a paragraph and returns a list of pre-processed
    sentences.

    Args : {
        para: raw paragraph
    }
    returns : {
        list of individual sentences in the paragraph
    }
    """
    # Splitting the paragraph into sentences
    sentences = sent_tokenize(para)
    processed_sentences = []
    for sent in sentences:
        # lowercase
        temp_text = sent.lower()
        # Converting symbols
        # temp_text = " ".join(symbol_conversion(sent))
        # Removing the non-alphabetic symbols (the original passed `sent`
        # here, discarding the lowercasing above; `temp_text` is intended)
        temp_text = strip_non_alphanum(temp_text)
        # Removing multiple white spaces
        temp_text = strip_multiple_whitespaces(temp_text)
        # Removing punctuations
        temp_text = strip_punctuation(temp_text)
        # Converting digits to alphabets
        temp_text = " ".join(replace_numbers(temp_text))
        # Remove stopword
        # temp_text = remove_stopwords(temp_text)
        # Remove short 1 letter values
        temp_text = strip_short(temp_text, minsize=2)
        # Lemmatization
        # doc = nlp(temp_text)
        # temp_text = " ".join([token.lemma_ for token in doc])
        if len(temp_text) > 1:
            processed_sentences.append(temp_text.lower())
    return processed_sentences
import os
import re

from gensim.parsing.preprocessing import (strip_multiple_whitespaces,
                                          strip_non_alphanum,
                                          strip_punctuation)
from sklearn.model_selection import train_test_split


def read_corpus(name, max_len=20, test_size=5000):
    filepath = 'yle-corpus/data/'
    with open(os.path.join(filepath, name), 'r') as f:
        # remove label and url from text
        text = f.read()
        text = re.sub(r'__label__\S*\s', '', text)
        text = re.sub(r'\S?http\S+', '', text)
        text = strip_multiple_whitespaces(text)
        text = strip_non_alphanum(text)
        text = strip_punctuation(text)
        text = text.lower()
        text = text.split()
        # dcm = [w for w in text if len(w) < max_len + 4 and len(w) > max_len]
        text = [w for w in text if len(w) <= max_len]
        # ml = max([len(w) for w in text])
        train, test = train_test_split(text, test_size=test_size,
                                       shuffle=False)
        return train, test
from io import StringIO

from gensim.parsing.preprocessing import strip_non_alphanum, strip_punctuation
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from tqdm import tqdm

# Exceptions is a project-specific error-reporting helper


def get_text(self):
    """
    Get the pre-processed text extracted from the PDF document

    Returns
    -------
    text : str
        Extracted text (None if the extraction fails)
    """
    output_string = StringIO()
    try:
        with open(self.path + self.name, 'rb') as file:
            document = PDFDocument(PDFParser(file))
            resource_manager = PDFResourceManager()
            device = TextConverter(resource_manager, output_string,
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(resource_manager, device)
            print(f'>>> Reading document "{self.name}"')
            for page in tqdm(list(PDFPage.create_pages(document)), ncols=80):
                interpreter.process_page(page)
    except UnicodeError:
        e = Exceptions(
            state='warning',
            message=f'The pdf file "{self.name}" cannot be read')
        e.throw()
        return
    text = output_string.getvalue()
    text = strip_non_alphanum(strip_punctuation(text.lower()))
    return text
import pandas as pd
from gensim.parsing.preprocessing import (strip_multiple_whitespaces,
                                          strip_non_alphanum)


def read_clean_inference_data(path_=None, prefixes_to_clean=None,
                              app_mode=False, df=None):
    """
    Reads (or receives) a dataframe and normalises every column whose name
    matches one of the given prefixes.

    :type path_: string
    :param path_: (optional) path to the file to read; ignored in app mode

    :type prefixes_to_clean: list
    :param prefixes_to_clean: list of column prefixes, i.e. just pass
        ["book", "author"] even though the actual columns are
        ["book1", "book2", "author1", "author2"].

    :type app_mode: bool
    :param app_mode: (optional) if True then path_ is not used and df should
        be passed

    :type df: pd.DataFrame
    :param df: pandas dataframe to clean using the prefixes mentioned

    :rtype: pd.DataFrame
    """
    if not app_mode:
        df = pd.read_csv(path_)
    cols = list()
    for i in prefixes_to_clean:
        cols.extend([j for j in df.columns if i in j])
    for i in cols:
        df[i] = df[i].apply(lambda x: strip_multiple_whitespaces(
            strip_non_alphanum(x.lower().strip())))
    return df
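# Hypothetical app-mode call for the function above; the column names and
# values are invented for illustration.
if __name__ == '__main__':
    demo = pd.DataFrame({'book1': ['  The  HOBBIT! '],
                         'author1': ['J.R.R. Tolkien']})
    print(read_clean_inference_data(prefixes_to_clean=['book', 'author'],
                                    app_mode=True, df=demo))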
def word_tokenize_and_remove_stop_words(text, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            words = text.lower().split()
        else:
            words = str(text).lower().split()
        if len(words) == 0:
            return ''
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        text = preprocessing.strip_punctuation(text)
        text = preprocessing.strip_non_alphanum(text)
        text = preprocessing.strip_numeric(text)
        text = preprocessing.strip_tags(text)
        text = preprocessing.strip_multiple_whitespaces(text)
        words = text.split()
        if len(words) == 0:
            return ''
        text = ' '.join(filter(lambda x: x not in stop_word2, words))
        return text.encode('utf-8')
    except UnicodeDecodeError:
        return ''
# NOTE: this snippet is Python 2 (print statements, byte-mode text handling)
import os

from gensim.parsing import preprocessing


def PPL_preprocess(d_type, yelp_round):
    if d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_dev_rd%d.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_test_rd%d.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None
    command = 'java -jar Split_PPL.jar %s %s' % (input_file, output_file)
    print command
    os.system(command)

    if d_type == 'dev':
        input_file = 'PPL_dev_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_dev_rd%d.tmp.tmp' % (yelp_round)
    else:
        input_file = 'PPL_test_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_test_rd%d.tmp.tmp' % (yelp_round)
    fin = open(input_file, 'rb')
    fo = open(output_file, 'wb')
    for s in fin:
        user_id = s.strip('\n').split()
        if len(user_id) <= 1:
            print "there is no word or only user_id in this line!"
            continue
        fo.write(user_id[0] + ' ')
        s = ' '.join(user_id[1:])
        try:
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
        except UnicodeDecodeError:
            fo.write('\n')
            continue
        s = ''
        actual_word_cnt = 0
        for ss in s_array:
            # skip Penn-Treebank bracket tokens (-LRB-, -RRB-, -LCB-, -RCB-)
            if ss in ("RRB", "LRB", "LCB", "RCB"):
                continue
            s = s + ss.lower() + ' '
            actual_word_cnt = actual_word_cnt + 1
        if actual_word_cnt > 0:
            fo.write(s[:-1])
        fo.write('\n')
    fin.close()
    fo.close()
    os.system('rm %s' % (input_file))

    # select one sentence of suitable length for each user, relaxing the
    # length window step by step until every user is covered
    dic = {}
    lower_bound = 8
    upper_bound = 10
    if d_type == 'dev':
        input_file = './PPL_dev_rd%d.tmp.tmp' % (yelp_round)
        output_file = './PPL_dev_rd%d.txt' % (yelp_round)
    else:
        input_file = './PPL_test_rd%d.tmp.tmp' % (yelp_round)
        output_file = './PPL_test_rd%d.txt' % (yelp_round)
    fo = open(output_file, "wb")
    user_count = 0
    user_file = 'user_file_rd%d.txt' % (yelp_round)
    with open(user_file, "rb") as fin:
        for line in fin:
            user_id = line.strip('\n')
            if user_id not in dic:
                dic[user_id] = user_count
                user_count = user_count + 1
    total = user_count
    print "total %d user" % (total)
    recorder = [0 for i in range(total)]

    def select_pass(delta):
        # one pass over the input, accepting lines whose token count falls
        # within the (widened) window; delta=None accepts any length
        with open(input_file, "rb") as fin:
            for line in fin:
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    continue
                if recorder[dic[array_line[0]]] != 0:
                    continue
                if delta is not None:
                    if not (lower_bound + 1 - delta <= len(array_line)
                            <= upper_bound + 1 + delta):
                        continue
                fo.write(line.strip('\n'))
                fo.write('\n')
                recorder[dic[array_line[0]]] = 1

    # the original unrolled these passes by hand for delta = 0, 1, 2, 3 and
    # then a final unrestricted pass
    for delta in [0, 1, 2, 3, None]:
        select_pass(delta)
        if all(recorder):
            break
    if not all(recorder):
        print "ERROR"
    fo.close()
    os.system('rm %s' % (input_file))
def test_strip_non_alphanum(self):
    self.assertEqual(strip_non_alphanum("toto nf-kappa titi"),
                     "toto nf kappa titi")
    ON abstracts.paperid=englishfields.paperid;
"""
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
second_cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
cur.execute(query)
for row in tqdm(cur):
    cur_paper_id = row.get('paperid')
    # we can now access the columns: row is a psycopg2.extras.RealDictRow
    # (inherited from dict)
    # print(row.keys()): dict_keys(['paperid', 'papertitle', 'abstract'])
    # IMPORTANT: EXPERIMENTAL: get the contexts from the papers which cite
    # the current paper
    contexts_query = """
        SELECT paperreferenceid,
               string_agg(citationcontext, ' ||--|| ') AS contexts
        FROM papercitationcontexts
        WHERE paperreferenceid=%s
        GROUP BY paperreferenceid;
    """
    second_cur.execute(contexts_query, (cur_paper_id, ))
    second_results = second_cur.fetchone()
    if not second_results:
        # second_results returned None: this paper has not been cited in any
        # citation context
        continue
    contexts = second_results['contexts']
    # contexts = contexts.split(' ||--|| ')
    contexts = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_non_alphanum(contexts))
    # print(contexts)
    combined_text = '{} {} {} {}\n'.format(cur_paper_id, row['papertitle'],
                                           row['abstract'], contexts)
    file.write(combined_text)
file.close()
stoplist = set(stopwords.iloc[:, 0].unique())
if stop_cities:
    cities = [
        'espoo', 'helsinki', 'turku', 'tampere', 'jyväskylä', 'kuopio',
        'oulu', 'espoon', 'helsingin', 'turun', 'tampereen', 'jyväskylän',
        'kuopion', 'oulun', 'kouvola', 'kouvolan', 'vaasa', 'vaasan',
        'lahti', 'lahden', 'kauhava', 'kauhavan', 'salo', 'salon',
        'turussa', 'helsingissä', 'espoossa', 'joensuun', 'kotkan',
        'keravan', 'hämeenlinnan', 'joensuun', 'mikkelin', 'vantaan',
        'vihdin'
    ]
    for city in cities:
        stoplist.add(city)
# the original comprehension also checked `word != []`, which is always true
# for a string token and has been dropped
texts = [[
    word for word in strip_short(
        strip_multiple_whitespaces(
            strip_numeric(strip_non_alphanum(document.lower()))),
        minsize=3).split() if word not in stoplist
] for document in documents]
if len(df) != len(texts):
    print("wrong lengths for df and texts!")
if len(texts) != len(documents):
    print("wrong lengths for texts and documents!")
# # # remove words that appear only once
# frequency = defaultdict(int)
# for text in texts:
#     for token in text:
#         frequency[token] += 1
#
def testStripNonAlphanum(self):
    self.assertEqual(strip_non_alphanum("toto nf-kappa titi"),
                     "toto nf kappa titi")
# corpus
# (os, pandas, TaggedDocument and word_tokenize are imported earlier in the
# original script)
file_dir = os.path.join('C:\\', 'Users', 'cruze', 'Documents', 'CS664')
inputfile = os.path.join(file_dir, 'train_E6oV3lV.csv')
df = pd.read_csv(inputfile)
corpus = df['tweet']
df = []

from gensim.parsing.preprocessing import (remove_stopwords, strip_punctuation,
                                          strip_non_alphanum, strip_numeric,
                                          strip_multiple_whitespaces, stem)

for msg in corpus:
    string = remove_stopwords(msg)
    string = strip_punctuation(string)
    string = strip_non_alphanum(string)
    string = strip_numeric(string)
    string = strip_multiple_whitespaces(string)
    string = stem(string)
    df.append(string)
corpus = df

# out = pd.DataFrame(data=corpus)
# out.to_csv('chatOut.csv', index_label=False)

tagged_data = [
    TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
    for i, _d in enumerate(corpus)
]

max_epochs = 50
vec_size = 50
alpha = 0.025
    family()
else:
    work()

print(
    '\n\nHi! A debt of gratitude is in order for coming here. I am a chatbot. '
    'Individuals say that I am a kind and receptive bot.')
name = input('If its not too much trouble let me know your name.\n')
try:
    preprocessed = [
        word for word in preprocess_string(name)
        if word not in ('people', 'call', 'friend')
    ][0]
    name = [
        word for word in strip_non_alphanum(name.lower()).split()
        if preprocessed in word
    ][0]
except IndexError:
    # no candidate token survived the filter (the original used a bare except)
    name = name.split()[0]
name = name[0].upper() + name[1:]
print("Hi " + name + "! My name's SYVBot. How about we begin with our session.")
reply = input("Hows life?\n")
if predict(reply) >= 0.55:
    reply = input('That is great. Are you as a rule this glad, or are there '
                  'a few stresses that you need to talk about?\n')
    if predict(reply) >= 0.7:
        reply = input('You appear to be extremely content. Wanna sign off?\n')
        if predict(reply) >= 0.7:
            print('Ok, bye ' + name + '!')
from gensim.parsing.preprocessing import strip_non_alphanum


def getline(filepath):
    # yields one list of alphanumeric tokens per line (the original named
    # the read handle `fout`)
    with open(filepath, 'r') as fin:
        for line in fin:
            yield strip_non_alphanum(line).split()
def get_processed_stems(self):
    return prep.stem(
        prep.remove_stopwords(prep.strip_non_alphanum(self.text))).split()
def clear_unknown_letter(text):
    text = strip_non_alphanum(text)
    text = word_tokenize(text)
    return process_lower(text)
# NOTE: this snippet is Python 2 (print statements, byte-mode text handling)
def SVM_preprocess(d_type, yelp_round):
    # preprocessing for sentiment classification using SVM: remove
    # punctuation, tags, multiple spaces and stop words, and convert all
    # words into lower case
    if d_type == 'train':
        input_file = 'train_rd%d.tmp' % (yelp_round)
        output_file = './SVM_train_rd%d.txt' % (yelp_round)
    elif d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = './SVM_dev_rd%d.txt' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = './SVM_test_rd%d.txt' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    # split the stop word list: words containing an apostrophe are removed
    # before punctuation stripping, the rest afterwards
    stop_word1 = set()
    stop_word2 = set()
    with open('english_stop.txt', "rb") as fs:
        for l in fs:
            s = l.strip('\n')
            if "'" in s:
                stop_word1.add(s)
            else:
                stop_word2.add(s)

    fin = open(input_file, "rb")
    fo = open(output_file, "wb")
    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag != 1:
            continue
        user_flag = 0
        if start != 1:
            fo.write('\n')
        else:
            start = 0
        user_id = s.strip('\n').split()
        if len(user_id) < 2:
            print "there is no user_id & star rating following the start_mark!"
        fo.write(user_id[0] + ' ' + user_id[1] + ' ')
        if len(user_id) <= 2:
            continue
        s = ' '.join(user_id[2:]) + ' '
        try:
            # first stop-word pass (keeps contractions intact)
            s_array = s.encode('utf8').split()
            s = ''
            for ss in s_array:
                ss = ss.lower()
                if ss not in stop_word1:
                    s = s + ss + ' '
            s = s.strip('\n')
            if len(s) == 0:
                continue
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            # second stop-word pass on the cleaned tokens
            s_array = s.encode('utf8').split()
            if len(s_array) == 0:
                continue
            s = ''
            for ss in s_array:
                if ss not in stop_word2:
                    s = s + ss + ' '
            if len(s) == 0:
                continue
            if s[-1] != ' ':
                s = s + ' '
            fo.write(s)
        except UnicodeDecodeError:
            continue
    fin.close()
    fo.close()
# NOTE: this snippet is Python 2 (print statements, byte-mode text handling)
def Train_preprocess(yelp_round):
    input_file = 'train_rd%d.tmp' % (yelp_round)
    output_file = './swe_train_rd%d.txt' % (yelp_round)
    fin = open(input_file, 'rb')
    fo = open(output_file, 'wb')
    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag != 1:
            continue
        user_flag = 0
        if start != 1:
            fo.write('\n')
        else:
            start = 0
        user_id = s.strip('\n').split()
        if len(user_id) < 1:
            print "there is no user_id following the start_mark!"
        fo.write(user_id[0] + ' ')
        if len(user_id) <= 1:
            continue
        s = ' '.join(user_id[1:]) + ' '
        try:
            s = s.strip('\n')
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
        except UnicodeDecodeError:
            continue
        s = ''
        actual_word_cnt = 0
        for ss in s_array:
            s = s + ss.lower() + ' '
            actual_word_cnt = actual_word_cnt + 1
        if actual_word_cnt > 0:
            fo.write(s[:-1])
    fin.close()
    fo.close()

    # get user_file and train_file (the original tested the nonexistent path
    # './get_user_train_file', which forced a recompile on every run; the
    # produced binary is checked instead)
    if not os.path.isfile('./get_user_file_w2v_train'):
        command = ('gcc get_user_file_w2v_train.c -o get_user_file_w2v_train '
                   '-lm -pthread -O3 -march=native -Wall -funroll-loops '
                   '-Wno-unused-result')
        print command
        os.system(command)
    user_file = 'user_file_rd%d.txt' % (yelp_round)
    w2v_train = './w2v_train_rd%d.txt' % (yelp_round)
    command = './get_user_file_w2v_train -input %s -user %s -word %s' % (
        output_file, user_file, w2v_train)
    print command
    os.system(command)
# NOTE: this snippet is Python 2 (print statements, byte-mode text handling)
def NN_preprocess(d_type, yelp_round):
    # preprocessing for sentiment classification using a deep neural network
    if d_type == 'train':
        input_file = 'train_rd%d.tmp' % (yelp_round)
        output_file = './NN_train_rd%d.tmp' % (yelp_round)
    elif d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = './NN_dev_rd%d.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = './NN_test_rd%d.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None
    command = 'java -jar Split_NN.jar %s %s' % (input_file, output_file)
    print command
    os.system(command)

    # remove stop words
    if d_type == 'train':
        input_file = './NN_train_rd%d.tmp' % (yelp_round)
        output_file = './NN_train_rd%d.txt' % (yelp_round)
    elif d_type == 'dev':
        input_file = './NN_dev_rd%d.tmp' % (yelp_round)
        output_file = './NN_dev_rd%d.txt' % (yelp_round)
    else:
        input_file = './NN_test_rd%d.tmp' % (yelp_round)
        output_file = './NN_test_rd%d.txt' % (yelp_round)

    # split the stop word list as in SVM_preprocess: words containing an
    # apostrophe are removed before punctuation stripping, the rest afterwards
    stop_word1 = set()
    stop_word2 = set()
    with open('english_stop.txt', "rb") as fs:
        for l in fs:
            s = l.strip('\n')
            if "'" in s:
                stop_word1.add(s)
            else:
                stop_word2.add(s)

    fin = open(input_file, 'rb')
    tar_file = open(output_file, 'w+')
    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag == 1:
            # the line right after the begin mark holds user_id and star
            # rating
            user_flag = 0
            if start != 1:
                tar_file.write('\n')
            else:
                start = 0
            user_star = s.strip('\n').split()
            if len(user_star) < 2:
                print "there is no user_id & star rating following the start_mark!"
                print len(user_star)
                for i in range(len(user_star)):
                    print user_star[i]
            tar_file.write(user_star[0] + '\t\t')
            tar_file.write(user_star[1] + '\t\t')
            continue
        # every other line is one sentence of the review, written out with a
        # trailing '#' separator
        try:
            s_array = s.encode('utf8').split()
            s = ''
            for ss in s_array:
                ss = ss.lower()
                if ss not in stop_word1:
                    s = s + ss + ' '
            s = s.strip('\n')
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
            s = ''
            actual_word_cnt = 0
            for ss in s_array:
                # skip Penn-Treebank bracket tokens (-LCB-, -LRB-, -RCB-,
                # -RRB-)
                if ss == "RRB" or ss == "LRB" or ss == "LCB" or ss == "RCB":
                    continue
                if ss not in stop_word2:
                    s = s + ss + ' '
                    actual_word_cnt = actual_word_cnt + 1
            if actual_word_cnt > 0:
                tar_file.write(s[:-1])
                tar_file.write('#')
        except UnicodeDecodeError:
            continue
    fin.close()
    tar_file.close()
    os.system('rm %s' % (input_file))
def get_processed_text(self):
    return prep.remove_stopwords(prep.strip_non_alphanum(self.text))