def load_all_query_annotated_robust4(
        file='/local/karmim/Stage_M1_RI/data/topics-title.annotated.csv',
        pre_process=True,
        CUSTOM_FILTERS=[lambda x: x.lower(), remove_stopwords],
        delete_meaning=True):
    query_an = {}  # Dict with words and concept for a query id
    concept = {}  # Dict with only the concept for a query id
    f = codecs.open(file, 'r', encoding='utf-8', errors='ignore')
    for line in f:
        #print(line.split())
        line = np.array(line.split())
        index = np.where(np.char.find(line, '$#') >= 0)
        concept[line[0]] = list(line[index])
        query_an[line[0]] = list(line[1:])

        if delete_meaning:
            concept[line[0]] = [w[:-5] for w in concept[line[0]]]
            query_an[line[0]] = [
                w[:-5] if '$#' in w else w for w in query_an[line[0]]
            ]
        if pre_process:
            # preprocess only the entry for the current query id; re-filtering every
            # stored query on each pass would apply the filters repeatedly
            query_an[line[0]] = preprocess_string(' '.join(query_an[line[0]]),
                                                  CUSTOM_FILTERS)
            concept[line[0]] = preprocess_string(' '.join(concept[line[0]]),
                                                 [lambda x: x.lower()])

    f.close()
    return query_an, concept
    def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        latin1 = partial(utils.to_unicode, encoding='latin1')
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus2 = [
                preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1])
                for s in f
            ]
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus2 = [
                preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1])
                for s in f
            ]

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)]
def parse_and_load_discussion_questions(course_data_path, conn,
                                        course_zip_name):
    """load, parse, process discussion questions
    """
    course_slug = course_zip_name.replace("_", "-")
    sql_select_discussion_question = (
        "SELECT discussion_question_id, discussion_question_title, " +
        "discussion_question_details " +
        "FROM discussion_questions, courses WHERE " +
        "discussion_questions.course_id == courses.course_id AND " +
        "courses.course_slug == (?)")

    c = conn.cursor()
    c.execute(sql_select_discussion_question, (course_slug, ))

    course_questions = {}

    rows = c.fetchmany()
    while rows:
        for row in rows:
            question_id, question_title, question_details = row
            course_questions[question_id] = (
                preprocess_string(question_title) +
                preprocess_string(question_details))
        rows = c.fetchmany()

    # save the course_questions to disk
    questions_filepath = os.path.join(course_data_path, "..",
                                      "questions.{}.json".format(course_slug))
    with open(questions_filepath, "w") as questions_file:
        json.dump(course_questions, questions_file)
Example #4
def main(args):
    doc_dir = args.doc_dir
    docs = defaultdict(list)
    for file in os.listdir(doc_dir):
        title, abstract = parse_xml(os.path.join(doc_dir, file))
        key = int(re.search(r'\d+', file).group())
        title = preprocess_string(title)
        abstract = preprocess_string(abstract)
        docs[key] = [title, abstract]

    train_word2vec(docs, args.model_file)
def preprocess(s, stem=True):
    '''
    given a document or query string, returns a list of preprocessed words.
    we can decide whether to stem each word or not.
    '''
    if not stem:
        preprocess_filters = DEFAULT_FILTERS.copy()
        preprocess_filters.pop()  # remove stemming from list of filters
        wordList = preprocess_string(s, filters=preprocess_filters)
    else:
        wordList = preprocess_string(s)
    for i in range(len(wordList)):
        wordList[i] = deaccent(wordList[i])
    return wordList
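A brief usage sketch of the helper above. The imports mirror what the excerpt assumes, and the exact tokens printed depend on gensim's DEFAULT_FILTERS, so the behaviour shown is only indicative.

from gensim.parsing.preprocessing import preprocess_string, DEFAULT_FILTERS
from gensim.utils import deaccent

print(preprocess("Détente negotiations were resuming", stem=True))   # stemmed, deaccented tokens
print(preprocess("Détente negotiations were resuming", stem=False))  # same pipeline without the final stem_text filter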
Example #6
    def preprocess_column(self, pd_data, load_model=False):
        """
        Preprocess specified column.

        Inputs:
            pd_data: (pd.Series) Input data to preprocess.

        Returns:
            pd_data: (pd.Series) Preprocess data.
        """
        # preprocess using set of filters
        custom_filters = self._build_custom_filter_list()

        log.info('Applying preprocess filters to the %s...', pd_data.name)
        pd_data = pd_data.apply(
            lambda x: gpp.preprocess_string(x, custom_filters),
            convert_dtype=False)

        # generate phrase based on the configuration
        pd_data = self._generate_phrase(pd_data, load_model=load_model)

        # join the list of words into space delimited string
        pd_data = pd_data.apply(lambda x: ' '.join(x))

        return pd_data
Example #7
    def _tokenize(document: dict, phraser=None):
        text_information = [value for key, value in document.items()]
        text = " ".join(text_information)


        def _custom_strip_short(s):
            return strip_short(s, minsize=2)


        def _custom_strip_numeric(s):
            RE_NUMERIC = re.compile(r' [0-9]+( [0-9]+)*(\.)? ', re.UNICODE)
            s = utils.to_unicode(s)
            return RE_NUMERIC.sub(" ", s)


        # most of the preprocessing is done already
        # strip_tags removes style definitions etc as well which is good
        CUSTOM_FILTERS = [strip_tags, _custom_strip_short, _custom_strip_numeric]
        preprocessed_text = preprocess_string(text, CUSTOM_FILTERS)

        if phraser:
            tokens = phraser.phrase(preprocessed_text)
            return [token.replace(' ', '_') for token in tokens]
        else:
            return preprocessed_text
Example #8
def find_nearest_words(model, work):
    # tags = parse_tags()
    # tax = parse_taxonomy()
    # ids = tags[work]['ids']
    # ts = [tax[d] for d in ids if 'stemning' in tax[d]]

    text = MAPPER.get_text(work)
    # print(text)
    # print(ts)

    M, labels = infer_taxonomy_vectors(model, taxonomy_path=None)
    max_dists = defaultdict(lambda: 0)

    for token in pre.preprocess_string(text, filters=FILTERS):
        if token in model.wv:
            v = model.wv[token].reshape(1, -1)
            dists = cosine_similarity(M, v)
            am = np.argmax(dists)
            max_dists[labels[am]] = max(max_dists[labels[am]], np.max(dists))

    md = sorted(max_dists.items(), key=lambda x: x[1], reverse=True)  # strongest matches first
    i = 0
    rtags = []
    for k, v in md:
        if i > 10:
            break
        if 'stemning::' in k and v > 0.5:
            i += 1

            rtags.append(k)
            # print(k, v)
    return rtags
def custom_tokenizer(s):
    return [
        w.translate(table) for w in preprocess_string(s, [
            strip_tags, lambda x: strip_short(x, 2), remove_stopwords,
            lambda x: ks.stem(x)
        ])
    ]
Example #10
    def remove(self, text):
        """[summary]

        Args:
            text ([type]): [description]

        Returns:
            [type]: [description]
        """
        # if text:
        CUSTOM_FILTERS = [
            lambda x: x.lower(),  # lowercase
            strip_multiple_whitespaces,
            strip_non_alphanum,
            strip_numeric,
            remove_stopwords,
            strip_short,
            #                   stem_text
        ]
        text = text.lower()
        example_sent = preprocess_string(text, CUSTOM_FILTERS)
        filtered_sentence = [
            w for w in example_sent if not w in self.get_stopwords()
        ]

        return filtered_sentence
Example #11
def litsiden():
    f = pr.resource_filename('matext', 'data/litteratursiden_pidmap.json')
    mapper = LitteraturSidenMapper()
    works = []
    df = {}

    lk_frac = None
    with open(f) as fh:
        data = json.load(fh)
        lk_frac = [
            len(data['train']) + len(data['dev']),
            len(data['train']) + len(data['dev']) + len(data['corpus'])
        ]

        for k in data.keys():
            works += data[k]
    for w in works:
        t = mapper.get_text(w)
        tokens = pre.preprocess_string(t, filters=FILTERS)

        df[w] = [len(tokens), len(t)]

    df = pd.DataFrame.from_dict(df, orient='index')
    df.columns = ['tokens', 'char-length']
    print(df.describe())
    fig, ax = plt.subplots()
    sns.distplot(df['char-length'],
                 ax=ax,
                 bins=100,
                 kde=False,
                 norm_hist=False)
    fig.savefig('lit_hist.png')
    print('LK', lk_frac)
def preprocess(datafolder):
    docs = {}
    nlp = load('en')

    for file in os.listdir(datafolder):  # go through all the files in the folder
        filepath = os.path.join(datafolder, file)
        if not file.startswith('.'):
            document = loadDoc(filepath)
            sentenceSplit = list(nlp(document).sents)
            gensimSettings = [
                lambda x: x.lower(),
                genPreProc.remove_stopwords,
                genPreProc.stem,  # make the text uniform and remove stopwords
                genPreProc.strip_non_alphanum,
                genPreProc.strip_multiple_whitespaces
            ]
            sentencePreprocess = [
                ' '.join(
                    preprocess_string(str(sentence), filters=gensimSettings))
                for sentence in sentenceSplit
            ]

            docs[os.path.basename(filepath)] = sentencePreprocess
    return docs
Example #13
def process_input(row):
    input_merged = row['Assignment Name'] + ' ' + row['School Category']
    
    # run the merged text through gensim's preprocess_string with txt_filters, then join the resulting tokens
    input_processed_tokens = " ".join(preprocess_string(input_merged, txt_filters))
    
    return input_processed_tokens
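A minimal usage sketch for process_input. txt_filters is defined elsewhere in the original project, so a plausible filter list is assumed here purely for illustration.

from gensim.parsing.preprocessing import (preprocess_string, remove_stopwords,
                                          strip_punctuation)

# assumed stand-in for the project's txt_filters (not shown in the excerpt)
txt_filters = [lambda x: x.lower(), strip_punctuation, remove_stopwords]

row = {'Assignment Name': 'Essay on Climate Change',
       'School Category': 'High School'}
print(process_input(row))  # a single space-joined string of preprocessed tokens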
Example #14
def remove_text_duplicates_retain_order(texts):
	"""
	Remove the duplicates from a list of text strings, where duplicates are defined as two text strings
	that differ only by puncutation, capitalization, or the length of whitespaces. This is useful for 
	not retaining extra text information just because its not perfectly identical to some existing string.
	Duplicates are removed such that the first occurence is retained, and that determines the final
	ordering. The texts that are returned are not processed, and are a subset of the original list of 
	text strings. The strings retained determined which version of that duplicate in terms of punctuation,
	capitalization, and whitespace is retained in the final list.
	
	Args:
	    texts (list of str): A list of arbitrary strings.
	
	Returns:
	    list of str: A subset of the original list, with duplicates as defined above removed.
	"""
	# Create a list of cleaned texts that corresponds to the list of texts passed in.
	filters = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces]
	cleaned_texts = [" ".join(preprocess_string(text, filters)) for text in texts]
	assert len(texts) == len(cleaned_texts)


	# Get a dictionary mapping the cleaned texts to the a list of the original texts that they resulted from.
	cleaned_to_originals = defaultdict(list)
	for cleaned_text,text in zip(cleaned_texts,texts):
		cleaned_to_originals[cleaned_text].append(text)

	# Remove duplicates and retain the order of the list of the cleaned texts.
	cleaned_texts_no_duplicates = remove_duplicates_retain_order(cleaned_texts)

	# Using whatever the first observed instance of original text that resulting in each cleaned text, rebuild the list.
	original_texts_with_same_removals = [cleaned_to_originals[cleaned_text][0] for cleaned_text in cleaned_texts_no_duplicates]
	return(original_texts_with_same_removals)
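A short usage sketch of the de-duplication helper above. remove_duplicates_retain_order comes from elsewhere in the original module, so a simple order-preserving stand-in is assumed here; the gensim imports are the ones the excerpt relies on.

from collections import defaultdict
from gensim.parsing.preprocessing import (preprocess_string, strip_tags,
                                          strip_punctuation,
                                          strip_multiple_whitespaces)

def remove_duplicates_retain_order(items):
    # order-preserving de-duplication (stand-in for the project's own helper)
    return list(dict.fromkeys(items))

texts = ["Leaf colour:  green!", "leaf colour green", "Flowers are red."]
print(remove_text_duplicates_retain_order(texts))
# keeps the first-seen variant of each duplicate:
# ['Leaf colour:  green!', 'Flowers are red.']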
Example #15
    def transform(self, df_x):
        return np.asmatrix(
            np.array([
                self._model.infer_vector(
                    preprocess_string(row['reviews_content']))
                for index, row in df_x.iterrows()
            ]))
def read_documents(path):
    dataset = []
    filter = [
        lambda x: x.lower(),
        strip_multiple_whitespaces,
        strip_numeric,
        strip_non_alphanum,
        strip_punctuation,
        remove_stopwords,
        strip_tags,
        lambda s: strip_short(s, minsize=4),
    ]
    LEN_THRESHOLD = 10
    for root, dirs, files in os.walk(path):
        if os.path.basename(root) in ["Cog", "NotCog"]:
            print(root)
            for f in files:
                with open(os.path.join(root, f), "r") as myfile:
                    text = myfile.read()
                    text = re.sub(r"[^\x00-\x7F]+", " ", text)
                    res = []
                    doc = nlp(text)
                    for sent in doc.sents:
                        sent = " ".join([word.lemma_ for word in sent])
                        res.append(" ".join(
                            preprocess_string(sent, filters=filter)))
                    text = "\n".join(res)
                    label = 0 if os.path.basename(root) == "Cog" else 1
                    if len(text) < LEN_THRESHOLD:
                        print(f)
                        continue
                    dataset.append((text, label))
    random.shuffle(dataset)
    texts, labels = zip(*dataset)
    return texts, labels
Example #17
def cleaner(path, d):
    with codecs.open(path, encoding='utf8') as f:
        text = f.read()
        text1 = unidecode.unidecode(text)

        # remove roman numerals
        text1 = removeRomanNumerals(text1)

        # strip numbers, whitespace, and punctuation
        EMBEDDING_FILTERS = [
            lambda x: x.lower(), strip_numeric, strip_multiple_whitespaces,
            strip_punctuation
        ]
        c = preprocess_string(text1, EMBEDDING_FILTERS)

        # replace and correct words
        c = replaceWordsFromMap(c, correctionDict)
        c = replaceWordsFromMap(c, syncopateDict)
        c = replaceWordsFromMap(c, variantDict)
        c = replaceWordsFromMap(c, variantDict2)

        d = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in c]

        t = " ".join(d)

        tokens = parallelRemove(d, 4)

        return tokens, t
Example #18
def flat_doc(document, model, extremes=None):
	flat_doc = ""
	for field in document:
		if not isinstance(document[field], list): continue  # skip the 'id' and '_version_' fields auto-generated by Solr
		for value in document[field]:
			## Language detection and translation ##
			if field=='author.authors.authorName' or field=='author.authorBio' or field=='description' or field=='quotes.quoteText':
				value_blob = TextBlob(value)
				try:
					if value_blob.detect_language() != 'en':
						try: 
							value = value_blob.translate(to='en')
						except Exception as e: 
							value = value #e = NotTranslated('Translation API returned the input string unchanged.',)
				except Exception as e:
					value = value #e = TranslatorError('Must provide a string with at least 3 characters.')
			############################
			flat_doc += str(value)+' ' # flatten the document into a single string
	flat_doc = preprocess_string(flat_doc, CUSTOM_FILTERS) # preprocess the string
	flat_doc = [w for w in flat_doc if w not in stop_words] # remove stop words
	if extremes:
		flat_doc = [w for w in flat_doc if w not in extremes]
	flat_doc = [w for w in flat_doc if w in model.vocab] # keep only words in the model's vocabulary
	if flat_doc == []:
		flat_doc = ['book'] # if the book ends up empty, add a token to avoid problems downstream
	return flat_doc
    def tokenize(self, text):
        """Tokenizes the provided text

        Args:
            text (str): The text to be tokenized

        Returns:
            list(tuple(str, int)): A list of (token, count) pairs from the text without the stopwords.

        """

        # make everything lowercase and strip punctuation
        CUSTOM_FILTERS = [lambda x: x.lower(), strip_punctuation]
        tokens = preprocess_string(text, CUSTOM_FILTERS)

        # filter out all stopwords
        filtered_tokens = [w for w in tokens if not w in self.__stopwords]

        # count the term frequency in the text
        count = { }
        for word in filtered_tokens:
            if word not in count:
                count[word] = 0
            count[word] += 1

        # sort the terms in descending order
        terms_sorted = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
        return terms_sorted
Example #20
def create_bigrams_and_remove_Stopwords(raw_transcripts_gensim):
    #creating phrases like good_afternoon, hdfc_life so that they can be removed as part of custom stop words.
    Preprocessed_Transcripts = []
    bigrams = []
    import gensim
    from gensim.parsing.preprocessing import preprocess_string, remove_stopwords

    CUSTOM_FILTERS = [remove_stopwords]
    for transcripts in raw_transcripts_gensim["Lemmatized_transcript"]:
        tokens = [list(gensim.utils.tokenize(transcripts, lower=True))]

        bigram_mdl = gensim.models.phrases.Phrases(tokens,
                                                   min_count=1,
                                                   threshold=5)
        #Preprocessed_Transcripts.append(token_bigrams(transcripts))
        #if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3
        tokens = [
            preprocess_string(" ".join(word), CUSTOM_FILTERS)
            for word in tokens
        ]
        bigrams = bigram_mdl[tokens]
        Preprocessed_Transcripts.append(list(bigrams))

    #final step of preprocessing to remove stop words
    Final_Preprocessed_Transcripts = []
    for tokenised_text in Preprocessed_Transcripts:
        #print(tokenised_text)
        for token in tokenised_text:
            Final_Preprocessed_Transcripts.append(preprocess(token))

    #Appending the final preprocessed transcript to the dataframe
    raw_transcripts_gensim[
        "Preprocessed Transcripts"] = Final_Preprocessed_Transcripts

    return raw_transcripts_gensim
def getExistingWordsFromModel(words):
    """ Checks if a list of words are in the dictionary of the word2vec model """
    CUSTOM_FILTERS = [strip_numeric, remove_stopwords]
    res = []
    for w in words:
        try:
            vec = word_vectors[w]
            res.append(w)
        except:
            try:
                w_transformed = w.replace(".", "").replace("=", "").replace(
                    "-", "").replace("*", "").replace("'", "").replace(
                        "`", "").replace("|", "").replace('\\', "").replace(
                            "/", "").replace("$", "").replace("^", "").replace(
                                "&", "").replace("@", "").replace("%", "")
                vec = word_vectors[w_transformed]
                res.append(w_transformed)
            except:
                try:
                    w_stripped = preprocess_string(w_transformed,
                                                   CUSTOM_FILTERS)
                    vec = word_vectors[w_stripped]
                    res.append(w_stripped)
                except:
                    continue
    return res
Example #22
    def __iter__(self):
        with open(self.filename) as file:
            for line in file:
                line = preprocess_string(line, [
                    lambda x: x.lower(), strip_tags, strip_multiple_whitespaces
                ])
                yield ['<SOS>', *line, '<EOS>']
Example #23
def cleanlines2(text):
    '''Clean text by removing URLs, punctuation, numbers, and extra whitespace,
    and converting to lowercase.'''
    text1 = str(text).lower()

    lines = []   #split in lines
    for line in text1.split('\n'):
        line = str(line)
        line = line.strip('\n')
        if line:
            lines.append(line)
    cleantext = ''
    for line in lines:
        filterreg = config.LABELREGEX.search(line)
        if filterreg is None:
            cleantext = cleantext + line #+ '\n'
        else:
            if filterreg.group():
                pass
            else:
                cleantext = cleantext + line #+ '\n'
    cleantext = str(cleantext)
    text1 = re.sub('\\S*@\\S*\\s?', '', cleantext)  # Remove Emails
    text1 = re.sub("\'", "", text1)                 #remove single quotes
    text1 = re.sub('\\s+', ' ', text1)              #remove new line character
    text1 = re.sub(r'http\S+', '', text1)           #remove URLs
    text1 = ' '.join(tokenize(str(text1)))  # tokenize() yields tokens; join them back into one string
    #using gensim to remove numbers, punctuation, whitespace, stopwords,
    #non-alfa, convert lowercase and stem
    text1 = ' '.join(preprocess_string(str(text1)))
    return text1
def clean_texts(texts: list) -> list:
    clean_texts = []
    for text in texts:
        processed_texts = preprocess_string(text, CUSTOM_FILTERS)
        processed_texts = [w for w in processed_texts if not w in STOP_WORDS]
        clean_texts.append(processed_texts)
    return clean_texts
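A minimal usage sketch for clean_texts. CUSTOM_FILTERS and STOP_WORDS live elsewhere in the original module, so illustrative values are assumed here.

from gensim.parsing.preprocessing import (preprocess_string, strip_punctuation,
                                          strip_numeric)

CUSTOM_FILTERS = [lambda x: x.lower(), strip_punctuation, strip_numeric]  # assumed
STOP_WORDS = {'the', 'and', 'is'}  # assumed

print(clean_texts(["The cat and the hat.", "Gensim is handy for text cleanup."]))
# one list of filtered tokens per input text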
Example #25
def output_csv_files(output_dfs, output_df_strs):
    assert (len(output_dfs) == len(output_df_strs))
    sia = SIA()
    for index in range(len(output_dfs)):
        preprocsplit = (lambda rev: preprocess_string(str(rev)))
        output_df, stem_name = output_dfs[index], output_df_strs[index]
        output_df[2] = (output_df[1].astype(str).apply(preprocsplit))
        new_output_df = pd.DataFrame()  #new output dataframe...
        for (_, row) in output_df.iterrows():
            if (('unprofession' in list(row[2])
                 or 'profession' in list(row[2]))):
                new_output_df = new_output_df.append(row, ignore_index=True)
        misclassified_df = pd.DataFrame()
        for (_, row) in new_output_df.iterrows():
            if (('unprofession' in list(row[2])
                 and sia.polarity_scores(row[0])['compound'] > 0.0)
                    or ('profession' in list(row[2])
                        and sia.polarity_scores(row[0])['compound'] < 0.0)):
                misclassified_df = misclassified_df.append(row,
                                                           ignore_index=True)
        f_name = 'misclassified_' + stem_name + '.csv'
        print(type(misclassified_df))
        print(f_name)
        misclassified_df.to_csv(f_name)
    return
Example #26
def trainD2V(fileName):
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        corpus = []
        for row in reader:
            #print(row[5])
            if row[5] and len(row[5]) > 10:
                pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)  #

                if (len(pp_news) > 2):
                    corpus.append(pp_news)

    tagged_documents = []
    for i, doc in enumerate(corpus):
        tagged = TaggedDocument(doc, [i])
        tagged_documents.append(tagged)

    dv = Doc2Vec(tagged_documents,
                 vector_size=100,
                 window=3,
                 min_count=10,
                 workers=4,
                 epochs=100)
    dv.train(tagged_documents,
             total_examples=dv.corpus_count,
             epochs=dv.epochs)

    return dv
Example #27
def chatcode():
    global name
    print('\n\nHello! Thanks for coming here. I am a chatbot. People say that '
      'I am a kind and approachable bot.')
    name = input('Please tell me your name.\n')
    try:
        preprocessed = [word for word in preprocess_string(name) if word not in (
                    'people', 'call', 'friend')][0]
        name = [word for word in strip_non_alphanum(name.lower()).split(
            ) if preprocessed in word][0]
    except:
        name = name.split()[0]
    name = name[0].upper() + name[1:]
    print("Hi " + name + "! My name's CAFE BUDDY. Let's start with our session.")
    response = input("How are you doing?\n")
    if (predict(response) >= 0.55):
        response = input('That is good. Are you usually this happy, or are there '\
                     'some worries that you want to talk about?\n')
        if (predict(response)>=0.7):
            response = input('You seem to be really content. Wanna sign off?\n')
            if(predict(response)>=0.7):
                print('Ok, bye ' + name + '!')
            else:
                response = input('Is there something bothering you? Would you '\
                             'share it with me?\n')
                if(predict(response)>=0.7):
                    print("That's okay. It was nice talking to you. You can chat "\
                      "with me anytime you want.\n Bye" + name + "!")
                else:
                    sad1()
        else:
            sad1()
    else:
        sad3()
Example #28
def trainW2V(fileName):
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        corpus = []

        for row in reader:
            #print(row[5])
            try:
                if index_in_list(row, 5) and len(row[5]) > 10:
                    pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)  #

                    if (len(pp_news) > 2):
                        corpus.append(pp_news)
            except:
                print("skipped: " + row[0])

    EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
    google_model = Word2Vec(size=300, window=5, min_count=2, workers=4)  # workers must be a positive thread count
    google_model.build_vocab(corpus)
    google_model.intersect_word2vec_format(EMBEDDING_FILE,
                                           lockf=1.0,
                                           binary=True)
    google_model.train(corpus,
                       total_examples=google_model.corpus_count,
                       epochs=5)
    return google_model.wv
Example #29
def getNewsRecommendationDoc2Vec(fileName, email, preference, fileRatings,
                                 alreadyLiked):
    userNews = []  # one entry per news item for this user: [email, news link, cosine similarity]
    documents = []
    links = []
    count = 0
    query = calculate_centroid(preference, 3, dv)
    with open(fileName, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter=';')
        for row in reader:

            if index_in_list(row, 5) and len(row[5]) > 10:
                news = []  # will hold this user's info for one news item

                # pre-process the news description
                pp_news = pp.preprocess_string(row[5], CUSTOM_FILTERS)

                if len(pp_news) > 2:
                    # compute the centroid for this news item
                    newsVector = calculate_centroid(pp_news, 3, dv)
                    try:
                        # cosine similarity between the news centroid and the user's preference centroid
                        cos_sim = 1 - spatial.distance.cosine(
                            query, newsVector)
                    except:
                        cos_sim = 0
                    # news info
                    news.append(email)  # email
                    news.append(row[1])  # link
                    news.append(cos_sim)  # cosine similarity

                    # add this news item to the user's list
                    userNews.append(news)

    file.close()

    # sort by cosine similarity, descending
    userNews.sort(key=itemgetter(2), reverse=True)

    with io.open(fileRatings, "a", encoding="utf-8") as myfile:
        i = 0
        for news in userNews:
            if i > 4:
                break

            if not (news[1] in alreadyLiked):
                myfile.write(news[0] + ";")  # email
                myfile.write(news[1] + ";")  # link
                myfile.write('{} \n'.format(news[2]))  # cosine similarity
                # print(news[1])  # pre-processed description

                i = i + 1

    print('Wrote recommendations to ' + fileRatings + ' for ' + email + '!')
    myfile.close()

    return ''
Example #30
File: util.py  Project: Badodon/FFNN
def load_data(fname):
    
    print('input file name:', fname)

    target = []  # labels
    source = []  # document vectors

    # build the document list
    document_list = []
    word_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ',  1)
        label = sample[0]
        target.append([label])  # label
        word_list = preprocess_string(sample[1])  # stopword removal and stemming
        document_list.append(word_list)  # per-document word list
    
    # build the dictionary
    # drop very rare and very frequent words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # vectorize each document as a bag of words
    for doc in document_list:
        tmp = dct.doc2bow(doc) # ex.[(4, 1), (23,1),..., (119,2)] 
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)    
    dataset['source'] = np.array(source)    

    return dataset #, max_len, width
def parse_and_load_discussion_answers(course_data_path, conn, course_zip_name):
    """load, parse, process discussion answers
    """
    course_slug = course_zip_name.replace("_", "-")
    sql_select_discussion_answer = (
        "SELECT discussion_answer_id, discussion_answer_content " +
        "FROM discussion_answers, courses WHERE " +
        "discussion_answers.course_id == courses.course_id AND " +
        "courses.course_slug == (?)")

    c = conn.cursor()
    c.execute(sql_select_discussion_answer, (course_slug, ))

    course_answers = {}

    rows = c.fetchmany()
    while rows:
        for row in rows:
            answer_id, answer_content = row
            course_answers[answer_id] = preprocess_string(answer_content)
        rows = c.fetchmany()

    # save the course_answers to disk
    answers_filepath = os.path.join(course_data_path, "..",
                                    "answers.{}.json".format(course_slug))
    with open(answers_filepath, "w") as answers_file:
        json.dump(course_answers, answers_file)
Example #32
    def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        with open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(f)
        with open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(f)
        with open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus2 = [preprocess_string(s, filters=DEFAULT_FILTERS[:-1]) for s in f]
        with open(os.path.join(pre_path, corpus_file)) as f:
            corpus2 = [preprocess_string(s, filters=DEFAULT_FILTERS[:-1]) for s in f]

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)]
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    logger = tools.get_logger('gensim', os.path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # initializations
    articles = {}
    all_missing = []
    redir_on = {}
    collisions = {}
    non_ascii = []
    site = mwclient.Site('en.wikipedia.org', '/w/api.php/')

    # get all txt files in a folder and iterate over them
    filelist = glob.glob(os.path.join(base_path,
                                      p['folder_path'],
                                      "*.txt"))
    for f in filelist:

        # get the word we are working on
        f_name = os.path.basename(f)
        k_word = os.path.splitext(f_name)[0]
        logger.info("working on file: %s" % f_name)

        # try to convert the word into ascii for the http query
        file_obj = codecs.open(f, "r", "utf-16")
        counter = 0
        words = []
        for w in file_obj.readlines():
            try:
                s = w.strip().decode('ascii')
                words.append(s)
            except Exception:
                counter += 1
                non_ascii.append(w.strip())
        logger.info("\t%d words containing non ascii are ommited" % counter)

        articles[k_word] = {}
        logger.info("\tfound %d words in file" % len(words))

        for word in words:
            data = {}
            page = site.Pages[word]

            # follow the redirect and check for collisions
            if page.redirect:
                res = re.search(r'\[\[(.+)\]\]', page.edit())
                redir_word = urllib.unquote(res.groups()[0])
                if redir_word in redir_on:
                    logger.warning("[%s AND %s] both redirect on --> %s" %
                                    (word, redir_on[redir_word], redir_word))
                    collisions[redir_word] = redir_on[redir_word]
                else:
                    logger.info("[%s] redir from [%s]" % (redir_word, word))
                    redir_on[redir_word] = word
                text = site.Pages[redir_word].edit()
                data['redirected'] = redir_word

            else:
                text = page.edit()

            # check for missing wikipedia articles
            if text == "":
                all_missing.append(word)
                continue

            # preprocess the received article
            data['text'] = wikicorpus.filter_wiki(text)
            in_ascii = ud.normalize('NFKD',
                                    data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][word] = data

    logger.info('add human rating to the articles')
    id_word = {}
    sparql_path = os.path.join(base_path, p['sparql_path'])
    with open(os.path.join(sparql_path, 'id_word.txt')) as f:
        for line in f.readlines():
            idx, word = line.strip().split('\t')
            id_word[idx] = word

    #add human rating to the wikipedia data
    not_found = []
    with open(os.path.join(sparql_path, p['human_file'])) as f:
        for line in f.readlines():
            arr = line.split()
            word = id_word[arr[0]]
            term = arr[3]
            try:
                articles[word][term]['rating'] = int(arr[4])
            except KeyError:
                not_found.append(term)
    logger.info("%d words from the ref queries not found" % len(not_found))

    f = open(os.path.join(output_dir, "articles.pickle"), 'wb')
    pickle.dump(articles, f)
    f.close()

    info = {}
    info['missing'] = all_missing
    info['redirs'] = redir_on
    info['collisions'] = collisions
    info['not_found'] = not_found
    info['non_ascii'] = non_ascii
    f = open(os.path.join(output_dir, "info.pickle"), 'wb')
    pickle.dump(info, f)
    f.close()

    logger.info("%d redirecting collisions (see info.pkl)" % len(collisions))
Example #34
            
            # download the content of the article
            
            # some redirects introduce no ascii characters 
            # TODO introduce a proper conversion of this characters
            try:
                title = title.decode('ascii')
            except Exception:
                continue
                
            query = (query_base + "&export") % title
            text    = myopener.open(query).read()
            soup    = BSS(text, convertEntities=BSS.ALL_ENTITIES)
            export  = BSS(soup.api.query.export.prettify())
            text    = BSS(export.mediawiki.page.revision.prettify())
            if text.revision.minor:
                data['text'] = wikicorpus.filterWiki(text.revision.minor.text)
            else:
                data['text'] = wikicorpus.filterWiki(text.revision.text)
            in_ascii = unicodedata.normalize('NFKD', data['text']).encode('ascii', 'ignore')
            data['text'] = preprocess_string(in_ascii)
            articles[k_word][title] = data

f = open(results_path + "sparql_wiki.pickle", 'wb')
pickle.dump(articles, f)
f.close()

print(sum(all_missing, []))