def load_all_query_annotated_robust4(
        file='/local/karmim/Stage_M1_RI/data/topics-title.annotated.csv',
        pre_process=True,
        CUSTOM_FILTERS=[lambda x: x.lower(), remove_stopwords],
        delete_meaning=True):
    query_an = {}  # Dict with words and concept for a query id
    concept = {}  # Dict with only the concept for a query id
    f = codecs.open(file, 'r', encoding='utf-8', errors='ignore')
    for line in f:
        #print(line.split())
        line = np.array(line.split())
        index = np.where(np.char.find(line, '$#') >= 0)
        concept[line[0]] = list(line[index])
        query_an[line[0]] = list(line[1:])

        if delete_meaning:
            concept[line[0]] = [w[:-5] for w in concept[line[0]]]
            query_an[line[0]] = [
                w[:-5] if '$#' in w else w for w in query_an[line[0]]
            ]
        if pre_process:
            # preprocess only the entry for the current query id; re-filtering every
            # stored query on each pass would apply the filters repeatedly
            query_an[line[0]] = preprocess_string(' '.join(query_an[line[0]]),
                                                  CUSTOM_FILTERS)
            concept[line[0]] = preprocess_string(' '.join(concept[line[0]]),
                                                 [lambda x: x.lower()])

    f.close()
    return query_an, concept
    def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        latin1 = partial(utils.to_unicode, encoding='latin1')
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus2 = [
                preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1])
                for s in f
            ]
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus2 = [
                preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1])
                for s in f
            ]

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)]
def parse_and_load_discussion_questions(course_data_path, conn,
                                        course_zip_name):
    """load, parse, process discussion questions
    """
    course_slug = course_zip_name.replace("_", "-")
    sql_select_discussion_question = (
        "SELECT discussion_question_id, discussion_question_title, " +
        "discussion_question_details " +
        "FROM discussion_questions, courses WHERE " +
        "discussion_questions.course_id == courses.course_id AND " +
        "courses.course_slug == (?)")

    c = conn.cursor()
    c.execute(sql_select_discussion_question, (course_slug, ))

    course_questions = {}

    rows = c.fetchmany()
    while rows:
        for row in rows:
            question_id, question_title, question_details = row
            course_questions[question_id] = (
                preprocess_string(question_title) +
                preprocess_string(question_details))
        rows = c.fetchmany()

    # save the course_questions to disk
    questions_filepath = os.path.join(course_data_path, "..",
                                      "questions.{}.json".format(course_slug))
    with open(questions_filepath, "w") as questions_file:
        json.dump(course_questions, questions_file)
Example #4
def main(args):
    doc_dir = args.doc_dir
    docs = defaultdict(list)
    for file in os.listdir(doc_dir):
        title, abstract = parse_xml(os.path.join(doc_dir, file))
        key = int(re.search(r'\d+', file).group())
        title = preprocess_string(title)
        abstract = preprocess_string(abstract)
        docs[key] = [title, abstract]

    train_word2vec(docs, args.model_file)
def preprocess(s, stem=True):
    '''
    given a document or query string, returns a list of preprocessed words.
    we can decide whether to stem each word or not.
    '''
    if not stem:
        preprocess_filters = DEFAULT_FILTERS.copy()
        preprocess_filters.pop()  # remove stemming from list of filters
        wordList = preprocess_string(s, filters=preprocess_filters)
    else:
        wordList = preprocess_string(s)
    for i in range(len(wordList)):
        wordList[i] = deaccent(wordList[i])
    return wordList
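A brief usage sketch of the helper above. The imports mirror what the excerpt assumes, and the exact tokens printed depend on gensim's DEFAULT_FILTERS, so the behaviour shown is only indicative.

from gensim.parsing.preprocessing import preprocess_string, DEFAULT_FILTERS
from gensim.utils import deaccent

print(preprocess("Détente negotiations were resuming", stem=True))   # stemmed, deaccented tokens
print(preprocess("Détente negotiations were resuming", stem=False))  # same pipeline without the final stem_text filter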
Example #6
    def preprocess_column(self, pd_data, load_model=False):
        """
        Preprocess specified column.

        Inputs:
            pd_data: (pd.Series) Input data to preprocess.

        Returns:
            pd_data: (pd.Series) Preprocess data.
        """
        # preprocess using set of filters
        custom_filters = self._build_custom_filter_list()

        log.info('Applying preprocess filters to the %s...', pd_data.name)
        pd_data = pd_data.apply(
            lambda x: gpp.preprocess_string(x, custom_filters),
            convert_dtype=False)

        # generate phrase based on the configuration
        pd_data = self._generate_phrase(pd_data, load_model=load_model)

        # join the list of words into space delimited string
        pd_data = pd_data.apply(lambda x: ' '.join(x))

        return pd_data
Example #7
    def _tokenize(document: dict, phraser=None):
        text_information = [value for key, value in document.items()]
        text = " ".join(text_information)


        def _custom_strip_short(s):
            return strip_short(s, minsize=2)


        def _custom_strip_numeric(s):
            RE_NUMERIC = re.compile(r' [0-9]+( [0-9]+)*(\.)? ', re.UNICODE)
            s = utils.to_unicode(s)
            return RE_NUMERIC.sub(" ", s)


        # most of the preprocessing is done already
        # strip_tags removes style definitions etc as well which is good
        CUSTOM_FILTERS = [strip_tags, _custom_strip_short, _custom_strip_numeric]
        preprocessed_text = preprocess_string(text, CUSTOM_FILTERS)

        if phraser:
            tokens = phraser.phrase(preprocessed_text)
            return [token.replace(' ', '_') for token in tokens]
        else:
            return preprocessed_text
Example #8
def find_nearest_words(model, work):
    # tags = parse_tags()
    # tax = parse_taxonomy()
    # ids = tags[work]['ids']
    # ts = [tax[d] for d in ids if 'stemning' in tax[d]]

    text = MAPPER.get_text(work)
    # print(text)
    # print(ts)

    M, labels = infer_taxonomy_vectors(model, taxonomy_path=None)
    max_dists = defaultdict(lambda: 0)

    for token in pre.preprocess_string(text, filters=FILTERS):
        if token in model.wv:
            v = model.wv[token].reshape(1, -1)
            dists = cosine_similarity(M, v)
            am = np.argmax(dists)
            max_dists[labels[am]] = max(max_dists[labels[am]], np.max(dists))

    md = sorted(max_dists.items(), key=lambda x: x[1], reverse=True)  # strongest matches first
    i = 0
    rtags = []
    for k, v in md:
        if i > 10:
            break
        if 'stemning::' in k and v > 0.5:
            i += 1

            rtags.append(k)
            # print(k, v)
    return rtags
def custom_tokenizer(s):
    return [
        w.translate(table) for w in preprocess_string(s, [
            strip_tags, lambda x: strip_short(x, 2), remove_stopwords,
            lambda x: ks.stem(x)
        ])
    ]
Example #10
    def remove(self, text):
        """[summary]

        Args:
            text ([type]): [description]

        Returns:
            [type]: [description]
        """
        # if text:
        CUSTOM_FILTERS = [
            lambda x: x.lower(),  # lowercase
            strip_multiple_whitespaces,
            strip_non_alphanum,
            strip_numeric,
            remove_stopwords,
            strip_short,
            #                   stem_text
        ]
        text = text.lower()
        example_sent = preprocess_string(text, CUSTOM_FILTERS)
        filtered_sentence = [
            w for w in example_sent if not w in self.get_stopwords()
        ]

        return filtered_sentence
Example #11
def litsiden():
    f = pr.resource_filename('matext', 'data/litteratursiden_pidmap.json')
    mapper = LitteraturSidenMapper()
    works = []
    df = {}

    lk_frac = None
    with open(f) as fh:
        data = json.load(fh)
        lk_frac = [
            len(data['train']) + len(data['dev']),
            len(data['train']) + len(data['dev']) + len(data['corpus'])
        ]

        for k in data.keys():
            works += data[k]
    for w in works:
        t = mapper.get_text(w)
        tokens = pre.preprocess_string(t, filters=FILTERS)

        df[w] = [len(tokens), len(t)]

    df = pd.DataFrame.from_dict(df, orient='index')
    df.columns = ['tokens', 'char-length']
    print(df.describe())
    fig, ax = plt.subplots()
    sns.distplot(df['char-length'],
                 ax=ax,
                 bins=100,
                 kde=False,
                 norm_hist=False)
    fig.savefig('lit_hist.png')
    print('LK', lk_frac)
def preprocess(datafolder):
    docs = {}
    nlp = load('en')

    for file in os.listdir(datafolder):  # go through all the files in the folder
        filepath = os.path.join(datafolder, file)
        if not file.startswith('.'):
            document = loadDoc(filepath)
            sentenceSplit = list(nlp(document).sents)
            gensimSettings = [
                lambda x: x.lower(),
                genPreProc.remove_stopwords,
                genPreProc.stem,  # make the text uniform and remove stopwords
                genPreProc.strip_non_alphanum,
                genPreProc.strip_multiple_whitespaces
            ]
            sentencePreprocess = [
                ' '.join(
                    preprocess_string(str(sentence), filters=gensimSettings))
                for sentence in sentenceSplit
            ]

            docs[os.path.basename(filepath)] = sentencePreprocess
    return docs
Example #13
def process_input(row):
    input_merged = row['Assignment Name'] + ' ' + row['School Category']
    
    # run the merged text through gensim's preprocess_string with txt_filters, then join the resulting tokens
    input_processed_tokens = " ".join(preprocess_string(input_merged, txt_filters))
    
    return input_processed_tokens
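A minimal usage sketch for process_input. txt_filters is defined elsewhere in the original project, so a plausible filter list is assumed here purely for illustration.

from gensim.parsing.preprocessing import (preprocess_string, remove_stopwords,
                                          strip_punctuation)

# assumed stand-in for the project's txt_filters (not shown in the excerpt)
txt_filters = [lambda x: x.lower(), strip_punctuation, remove_stopwords]

row = {'Assignment Name': 'Essay on Climate Change',
       'School Category': 'High School'}
print(process_input(row))  # a single space-joined string of preprocessed tokens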
Example #14
def remove_text_duplicates_retain_order(texts):
	"""
	Remove the duplicates from a list of text strings, where duplicates are defined as two text strings
	that differ only by puncutation, capitalization, or the length of whitespaces. This is useful for 
	not retaining extra text information just because its not perfectly identical to some existing string.
	Duplicates are removed such that the first occurence is retained, and that determines the final
	ordering. The texts that are returned are not processed, and are a subset of the original list of 
	text strings. The strings retained determined which version of that duplicate in terms of punctuation,
	capitalization, and whitespace is retained in the final list.
	
	Args:
	    texts (list of str): A list of arbitrary strings.
	
	Returns:
	    list of str: A subset of the original list, with duplicates as defined above removed.
	"""
	# Create a list of cleaned texts that corresponds to the list of texts passed in.
	filters = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces]
	cleaned_texts = [" ".join(preprocess_string(text, filters)) for text in texts]
	assert len(texts) == len(cleaned_texts)


	# Get a dictionary mapping the cleaned texts to the a list of the original texts that they resulted from.
	cleaned_to_originals = defaultdict(list)
	for cleaned_text,text in zip(cleaned_texts,texts):
		cleaned_to_originals[cleaned_text].append(text)

	# Remove duplicates and retain the order of the list of the cleaned texts.
	cleaned_texts_no_duplicates = remove_duplicates_retain_order(cleaned_texts)

	# Using whatever the first observed instance of original text that resulting in each cleaned text, rebuild the list.
	original_texts_with_same_removals = [cleaned_to_originals[cleaned_text][0] for cleaned_text in cleaned_texts_no_duplicates]
	return(original_texts_with_same_removals)
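A short usage sketch of the de-duplication helper above. remove_duplicates_retain_order comes from elsewhere in the original module, so a simple order-preserving stand-in is assumed here; the gensim imports are the ones the excerpt relies on.

from collections import defaultdict
from gensim.parsing.preprocessing import (preprocess_string, strip_tags,
                                          strip_punctuation,
                                          strip_multiple_whitespaces)

def remove_duplicates_retain_order(items):
    # order-preserving de-duplication (stand-in for the project's own helper)
    return list(dict.fromkeys(items))

texts = ["Leaf colour:  green!", "leaf colour green", "Flowers are red."]
print(remove_text_duplicates_retain_order(texts))
# keeps the first-seen variant of each duplicate:
# ['Leaf colour:  green!', 'Flowers are red.']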
Example #15
    def transform(self, df_x):
        return np.asmatrix(
            np.array([
                self._model.infer_vector(
                    preprocess_string(row['reviews_content']))
                for index, row in df_x.iterrows()
            ]))
def read_documents(path):
    dataset = []
    filter = [
        lambda x: x.lower(),
        strip_multiple_whitespaces,
        strip_numeric,
        strip_non_alphanum,
        strip_punctuation,
        remove_stopwords,
        strip_tags,
        lambda s: strip_short(s, minsize=4),
    ]
    LEN_THRESHOLD = 10
    for root, dirs, files in os.walk(path):
        if os.path.basename(root) in ["Cog", "NotCog"]:
            print(root)
            for f in files:
                with open(os.path.join(root, f), "r") as myfile:
                    text = myfile.read()
                    text = re.sub(r"[^\x00-\x7F]+", " ", text)
                    res = []
                    doc = nlp(text)
                    for sent in doc.sents:
                        sent = " ".join([word.lemma_ for word in sent])
                        res.append(" ".join(
                            preprocess_string(sent, filters=filter)))
                    text = "\n".join(res)
                    label = 0 if os.path.basename(root) == "Cog" else 1
                    if len(text) < LEN_THRESHOLD:
                        print(f)
                        continue
                    dataset.append((text, label))
    random.shuffle(dataset)
    texts, labels = zip(*dataset)
    return texts, labels
Example #17
def cleaner(path, d):
    with codecs.open(path, encoding='utf8') as f:
        text = f.read()
        text1 = unidecode.unidecode(text)

        # remove roman numerals
        text1 = removeRomanNumerals(text1)

        # strip numbers, whitespace, and punctuation
        EMBEDDING_FILTERS = [
            lambda x: x.lower(), strip_numeric, strip_multiple_whitespaces,
            strip_punctuation
        ]
        c = preprocess_string(text1, EMBEDDING_FILTERS)

        # replace and correct words
        c = replaceWordsFromMap(c, correctionDict)
        c = replaceWordsFromMap(c, syncopateDict)
        c = replaceWordsFromMap(c, variantDict)
        c = replaceWordsFromMap(c, variantDict2)

        d = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in c]

        t = " ".join(d)

        tokens = parallelRemove(d, 4)

        return tokens, t
Example #18
def flat_doc(document, model, extremes=None):
	flat_doc = ""
	for field in document:
		if not isinstance(document[field], list): continue  # skip the 'id' and '_version_' fields auto-generated by Solr
		for value in document[field]:
			## Language detection and translation ##
			if field=='author.authors.authorName' or field=='author.authorBio' or field=='description' or field=='quotes.quoteText':
				value_blob = TextBlob(value)
				try:
					if value_blob.detect_language() != 'en':
						try: 
							value = value_blob.translate(to='en')
						except Exception as e: 
							value = value #e = NotTranslated('Translation API returned the input string unchanged.',)
				except Exception as e:
					value = value #e = TranslatorError('Must provide a string with at least 3 characters.')
			############################
			flat_doc += str(value)+' ' # flatten the document into a single string
	flat_doc = preprocess_string(flat_doc, CUSTOM_FILTERS) # preprocess the string
	flat_doc = [w for w in flat_doc if w not in stop_words] # remove stop words
	if extremes:
		flat_doc = [w for w in flat_doc if w not in extremes]
	flat_doc = [w for w in flat_doc if w in model.vocab] # keep only words in the model's vocabulary
	if flat_doc == []:
		flat_doc = ['book'] # if the book ends up empty, add a token to avoid problems downstream
	return flat_doc
    def tokenize(self, text):
        """Tokenizes the provided text

        Args:
            text (str): The text to be tokenized

        Returns:
            list(tuple(str, int)): A list of (token, count) pairs from the text without the stopwords.

        """

        # make everything lowercase and strip punctuation
        CUSTOM_FILTERS = [lambda x: x.lower(), strip_punctuation]
        tokens = preprocess_string(text, CUSTOM_FILTERS)

        # filter out all stopwords
        filtered_tokens = [w for w in tokens if not w in self.__stopwords]

        # count the term frequency in the text
        count = { }
        for word in filtered_tokens:
            if word not in count:
                count[word] = 0
            count[word] += 1

        # sort the terms in descending order
        terms_sorted = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
        return terms_sorted
Example #20
def create_bigrams_and_remove_Stopwords(raw_transcripts_gensim):
    #creating phrases like good_afternoon, hdfc_life so that they can be removed as part of custom stop words.
    Preprocessed_Transcripts = []
    bigrams = []
    import gensim
    from gensim.parsing.preprocessing import preprocess_string, remove_stopwords

    CUSTOM_FILTERS = [remove_stopwords]
    for transcripts in raw_transcripts_gensim["Lemmatized_transcript"]:
        tokens = [list(gensim.utils.tokenize(transcripts, lower=True))]

        bigram_mdl = gensim.models.phrases.Phrases(tokens,
                                                   min_count=1,
                                                   threshold=5)
        #Preprocessed_Transcripts.append(token_bigrams(transcripts))
        #if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
        tokens = [
            preprocess_string(" ".join(word), CUSTOM_FILTERS)
            for word in tokens
        ]
        bigrams = bigram_mdl[tokens]
        Preprocessed_Transcripts.append(list(bigrams))

    #final step of preprocessing to remove stop words
    Final_Preprocessed_Transcripts = []
    for tokenised_text in Preprocessed_Transcripts:
        #print(tokenised_text)
        for token in tokenised_text:
            Final_Preprocessed_Transcripts.append(preprocess(token))

    #Appending the final preprocessed transcript to the dataframe
    raw_transcripts_gensim[
        "Preprocessed Transcripts"] = Final_Preprocessed_Transcripts

    return raw_transcripts_gensim
def getExistingWordsFromModel(words):
    """ Checks if a list of words are in the dictionary of the word2vec model """
    CUSTOM_FILTERS = [strip_numeric, remove_stopwords]
    res = []
    for w in words:
        try:
            vec = word_vectors[w]
            res.append(w)
        except:
            try:
                w_transformed = w.replace(".", "").replace("=", "").replace(
                    "-", "").replace("*", "").replace("'", "").replace(
                        "`", "").replace("|", "").replace('\\', "").replace(
                            "/", "").replace("$", "").replace("^", "").replace(
                                "&", "").replace("@", "").replace("%", "")
                vec = word_vectors[w_transformed]
                res.append(w_transformed)
            except:
                try:
                    w_stripped = preprocess_string(w_transformed,
                                                   CUSTOM_FILTERS)
                    vec = word_vectors[w_stripped]
                    res.append(w_stripped)
                except:
                    continue
    return res
Example #22
    def __iter__(self):
        with open(self.filename) as file:
            for line in file:
                line = preprocess_string(line, [
                    lambda x: x.lower(), strip_tags, strip_multiple_whitespaces
                ])
                yield ['<SOS>', *line, '<EOS>']
Example #23
def cleanlines2(text):
    '''Clean text by removing URLs, punctuation, numbers, and extra whitespace,
    and converting to lowercase.'''
    text1 = str(text).lower()

    lines = []   #split in lines
    for line in text1.split('\n'):
        line = str(line)
        line = line.strip('\n')
        if line:
            lines.append(line)
    cleantext = ''
    for line in lines:
        filterreg = config.LABELREGEX.search(line)
        if filterreg is None:
            cleantext = cleantext + line #+ '\n'
        else:
            if filterreg.group():
                pass
            else:
                cleantext = cleantext + line #+ '\n'
    cleantext = str(cleantext)
    text1 = re.sub('\\S*@\\S*\\s?', '', cleantext)  # Remove Emails
    text1 = re.sub("\'", "", text1)                 #remove single quotes
    text1 = re.sub('\\s+', ' ', text1)              #remove new line character
    text1 = re.sub(r'http\S+', '', text1)           #remove URLs
    text1 = ' '.join(tokenize(str(text1)))  # tokenize() yields tokens; join them back into one string
    #using gensim to remove numbers, punctuation, whitespace, stopwords,
    #non-alfa, convert lowercase and stem
    text1 = ' '.join(preprocess_string(str(text1)))
    return text1
def clean_texts(texts: list) -> list:
    clean_texts = []
    for text in texts:
        processed_texts = preprocess_string(text, CUSTOM_FILTERS)
        processed_texts = [w for w in processed_texts if not w in STOP_WORDS]
        clean_texts.append(processed_texts)
    return clean_texts
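A minimal usage sketch for clean_texts. CUSTOM_FILTERS and STOP_WORDS live elsewhere in the original module, so illustrative values are assumed here.

from gensim.parsing.preprocessing import (preprocess_string, strip_punctuation,
                                          strip_numeric)

CUSTOM_FILTERS = [lambda x: x.lower(), strip_punctuation, strip_numeric]  # assumed
STOP_WORDS = {'the', 'and', 'is'}  # assumed

print(clean_texts(["The cat and the hat.", "Gensim is handy for text cleanup."]))
# one list of filtered tokens per input text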
Example #25
def output_csv_files(output_dfs, output_df_strs):
    assert (len(output_dfs) == len(output_df_strs))
    sia = SIA()
    for index in range(len(output_dfs)):
        preprocsplit = (lambda rev: preprocess_string(str(rev)))
        output_df, stem_name = output_dfs[index], output_df_strs[index]
        output_df[2] = (output_df[1].astype(str).apply(preprocsplit))
        new_output_df = pd.DataFrame()  #new output dataframe...
        for (_, row) in output_df.iterrows():
            if (('unprofession' in list(row[2])
                 or 'profession' in list(row[2]))):
                new_output_df = new_output_df.append(row, ignore_index=True)
        misclassified_df = pd.DataFrame()
        for (_, row) in new_output_df.iterrows():
            if (('unprofession' in list(row[2])
                 and sia.polarity_scores(row[0])['compound'] > 0.0)
                    or ('profession' in list(row[2])
                        and sia.polarity_scores(row[0])['compound'] < 0.0)):
                misclassified_df = misclassified_df.append(row,
                                                           ignore_index=True)
        f_name = 'misclassified_' + stem_name + '.csv'
        print(type(misclassified_df))
        print(f_name)
        misclassified_df.to_csv(f_name)
    return
Example #26
def trainD2V(fileName):
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        corpus = []
        for row in reader:
            #print(row[5])
            if row[5] and len(row[5]) > 10:
                pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)  #

                if (len(pp_news) > 2):
                    corpus.append(pp_news)

    tagged_documents = []
    for i, doc in enumerate(corpus):
        tagged = TaggedDocument(doc, [i])
        tagged_documents.append(tagged)

    dv = Doc2Vec(tagged_documents,
                 vector_size=100,
                 window=3,
                 min_count=10,
                 workers=4,
                 epochs=100)
    dv.train(tagged_documents,
             total_examples=dv.corpus_count,
             epochs=dv.epochs)

    return dv
Example #27
def chatcode():
    global name
    print('\n\nHello! Thanks for coming here. I am a chatbot. People say that '
      'I am a kind and approachable bot.')
    name = input('Please tell me your name.\n')
    try:
        preprocessed = [word for word in preprocess_string(name) if word not in (
                    'people', 'call', 'friend')][0]
        name = [word for word in strip_non_alphanum(name.lower()).split(
            ) if preprocessed in word][0]
    except:
        name = name.split()[0]
    name = name[0].upper() + name[1:]
    print("Hi " + name + "! My name's CAFE BUDDY. Let's start with our session.")
    response = input("How are you doing?\n")
    if (predict(response) >= 0.55):
        response = input('That is good. Are you usually this happy, or are there '\
                     'some worries that you want to talk about?\n')
        if (predict(response)>=0.7):
            response = input('You seem to be really content. Wanna sign off?\n')
            if(predict(response)>=0.7):
                print('Ok, bye ' + name + '!')
            else:
                response = input('Is there something bothering you? Would you '\
                             'share it with me?\n')
                if(predict(response)>=0.7):
                    print("That's okay. It was nice talking to you. You can chat "\
                      "with me anytime you want.\n Bye" + name + "!")
                else:
                    sad1()
        else:
            sad1()
    else:
        sad3()
Example #28
def trainW2V(fileName):
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        corpus = []

        for row in reader:
            #print(row[5])
            try:
                if index_in_list(row, 5) and len(row[5]) > 10:
                    pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)  #

                    if (len(pp_news) > 2):
                        corpus.append(pp_news)
            except:
                print("skipped: " + row[0])

    EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
    google_model = Word2Vec(size=300, window=5, min_count=2, workers=4)  # workers must be a positive thread count
    google_model.build_vocab(corpus)
    google_model.intersect_word2vec_format(EMBEDDING_FILE,
                                           lockf=1.0,
                                           binary=True)
    google_model.train(corpus,
                       total_examples=google_model.corpus_count,
                       epochs=5)
    return google_model.wv
Example #29
def getNewsRecommendationDoc2Vec(fileName, email, preference, fileRatings,
                                 alreadyLiked):
    userNews = []  # one entry per news item for this user: [email, news link, cosine similarity]
    documents = []
    links = []
    count = 0
    query = calculate_centroid(preference, 3, dv)
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        for row in reader:

            if index_in_list(row, 5) and len(row[5]) > 10:
                news = []  # will hold this user's info for one news item

                # pre-process the news description
                pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)

                if len(pp_news) > 2:
                    # compute the centroid for this news item
                    newsVector = calculate_centroid(pp_news, 3, dv)
                    try:
                        # cosine similarity between the news centroid and the user's preference centroid
                        cos_sim = 1 - spatial.distance.cosine(
                            query, newsVector)
                    except:
                        cos_sim = 0
                    # news info
                    news.append(email)  # email
                    news.append(row[1])  # link
                    news.append(cos_sim)  # cosine similarity

                    # add this news item to the user's list
                    userNews.append(news)

    file.close()

    # sort by cosine similarity, descending
    userNews.sort(key=itemgetter(2), reverse=True)

    with io.open(fileRatings, "a", encoding="utf-8") as myfile:
        i = 0
        for news in userNews:
            if i > 4:
                break

            if not (news[1] in alreadyLiked):
                myfile.write(news[0] + ";")  # email
                myfile.write(news[1] + ";")  # link
                myfile.write('{} \n'.format(news[2]))  # cosine similarity
                # print(news[1])  # pre-processed description

                i = i + 1

    print('Wrote recommendations to ' + fileRatings + ' for ' + email + '!')
    myfile.close()

    return ''
Example #30
File: util.py  Project: Badodon/FFNN
def load_data(fname):
    
    print('input file name:', fname)

    target = []  # labels
    source = []  # document vectors

    # build the document list
    document_list = []
    word_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ',  1)
        label = sample[0]
        target.append([label])  # label
        word_list = preprocess_string(sample[1])  # stopword removal and stemming
        document_list.append(word_list)  # per-document word list
    
    # build the dictionary
    # drop very rare and very frequent words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # vectorize each document as a bag of words
    for doc in document_list:
        tmp = dct.doc2bow(doc) # ex.[(4, 1), (23,1),..., (119,2)] 
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)    
    dataset['source'] = np.array(source)    

    return dataset #, max_len, width
def parse_and_load_discussion_answers(course_data_path, conn, course_zip_name):
    """load, parse, process discussion answers
    """
    course_slug = course_zip_name.replace("_", "-")
    sql_select_discussion_answer = (
        "SELECT discussion_answer_id, discussion_answer_content " +
        "FROM discussion_answers, courses WHERE " +
        "discussion_answers.course_id == courses.course_id AND " +
        "courses.course_slug == (?)")

    c = conn.cursor()
    c.execute(sql_select_discussion_answer, (course_slug, ))

    course_answers = {}

    rows = c.fetchmany()
    while rows:
        for row in rows:
            answer_id, answer_content = row
            course_answers[answer_id] = preprocess_string(answer_content)
        rows = c.fetchmany()

    # save the course_answers to disk
    answers_filepath = os.path.join(course_data_path, "..",
                                    "answers.{}.json".format(course_slug))
    with open(answers_filepath, "w") as answers_file:
        json.dump(course_answers, answers_file)
Example #32
    def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        with open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(f)
        with open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(f)
        with open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus2 = [preprocess_string(s, filters=DEFAULT_FILTERS[:-1]) for s in f]
        with open(os.path.join(pre_path, corpus_file)) as f:
            corpus2 = [preprocess_string(s, filters=DEFAULT_FILTERS[:-1]) for s in f]

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)]
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # initializations
    articles = {}
    all_missing = []
    redir_on = {}
    collisions = {}
    non_ascii = []
    site = mwclient.Site('en.wikipedia.org', '/w/api.php/')

    # get all txt files in a folder and iterate over them
    filelist = glob.glob(os.path.join(base_path,
                                      p['folder_path'],
                                      "*.txt"))
    for f in filelist:

        # get the word we are working on
        f_name = os.path.basename(f)
        k_word = os.path.splitext(f_name)[0]
        logger.info("working on file: %s" % f_name)

        # try to convert the word into ascii for the http query
        file_obj = codecs.open(f, "r", "utf-16")
        counter = 0
        words = []
        for w in file_obj.readlines():
            try:
                s = w.strip().decode('ascii')
                words.append(s)
            except Exception:
                counter += 1
                non_ascii.append(w.strip())
        logger.info("\t%d words containing non ascii are ommited" % counter)

        articles[k_word] = {}
        logger.info("\tfound %d words in file" % len(words))

        for word in words:
            data = {}
            page = site.Pages[word]

            # follow the redirect and check for collisions
            if page.redirect:
                res = re.search(r'\[\[(.+)\]\]', page.edit())
                redir_word = urllib.unquote(res.groups()[0])
                if redir_word in redir_on:
                    logger.warning("[%s AND %s] both redirect on --> %s" %
                                    (word, redir_on[redir_word], redir_word))
                    collisions[redir_word] = redir_on[redir_word]
                else:
                    logger.info("[%s] redir from [%s]" % (redir_word, word))
                    redir_on[redir_word] = word
                text = site.Pages[redir_word].edit()
                data['redirected'] = redir_word

            else:
                text = page.edit()

            # check for missing wikipedia articles
            if text == "":
                all_missing.append(word)
                continue

            # preprocess the received article
            data['text'] = wikicorpus.filter_wiki(text)
            in_ascii = ud.normalize('NFKD',
                                    data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][word] = data

    logger.info('add human rating to the articles')
    id_word = {}
    sparql_path = os.path.join(base_path, p['sparql_path'])
    with open(os.path.join(sparql_path, 'id_word.txt')) as f:
        for line in f.readlines():
            idx, word = line.strip().split('\t')
            id_word[idx] = word

    #add human rating to the wikipedia data
    not_found = []
    with open(os.path.join(sparql_path, p['human_file'])) as f:
        for line in f.readlines():
            arr = line.split()
            word = id_word[arr[0]]
            term = arr[3]
            try:
                articles[word][term]['rating'] = int(arr[4])
            except KeyError:
                not_found.append(term)
    logger.info("%d words from the ref queries not found" % len(not_found))

    f = open(os.path.join(output_dir, "articles.pickle"), 'wb')
    pickle.dump(articles, f)
    f.close()

    info = {}
    info['missing'] = all_missing
    info['redirs'] = redir_on
    info['collisions'] = collisions
    info['not_found'] = not_found
    info['non_ascii'] = non_ascii
    f = open(os.path.join(output_dir, "info.pickle"), 'wb')
    pickle.dump(info, f)
    f.close()

    logger.info("%d redirecting collisions (see info.pkl)" % len(collisions))
Example #34
            
            # download the content of the article
            
            # some redirects introduce no ascii characters 
            # TODO introduce a proper conversion of this characters
            try:
                title = title.decode('ascii')
            except Exception:
                continue
                
            query = (query_base + "&export") % title
            text    = myopener.open(query).read()
            soup    = BSS(text, convertEntities=BSS.ALL_ENTITIES)
            export  = BSS(soup.api.query.export.prettify())
            text    = BSS(export.mediawiki.page.revision.prettify())
            if text.revision.minor:
                data['text'] = wikicorpus.filterWiki(text.revision.minor.text)
            else:
                data['text'] = wikicorpus.filterWiki(text.revision.text)
            in_ascii = unicodedata.normalize('NFKD', data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][title] = data

f = open(results_path + "sparql_wiki.pickle", 'wb')
pickle.dump(articles, f)
f.close()

print(sum(all_missing, []))